|
@@ -2,7 +2,7 @@ import { writeFileSync } from 'fs';
|
|
|
import { Readable } from 'stream';
|
|
|
import { promisify } from 'util';
|
|
|
|
|
|
-import axios from 'axios';
|
|
|
+import axios, { ResponseType as AxiosResponseType } from 'axios';
|
|
|
import * as CallableInstance from 'callable-instance';
|
|
|
import { XmlEntities } from 'html-entities';
|
|
|
import { PNG } from 'pngjs';
|
|
@@ -13,7 +13,7 @@ import * as temp from 'temp';
|
|
|
import { getLogger } from './loggers';
|
|
|
import { Message } from './koishi';
|
|
|
import { MediaEntity, Tweet } from './twitter';
|
|
|
-import { chainPromises } from './utils';
|
|
|
+import { chainPromises, M3u8 } from './utils';
|
|
|
|
|
|
const xmlEntities = new XmlEntities();
|
|
|
|
|
@@ -30,6 +30,27 @@ const typeInZH = {
|
|
|
|
|
|
const logger = getLogger('webshot');
|
|
|
|
|
|
/**
 * Issues a GET request through axios and resolves with the response body.
 *
 * @param url - Address to fetch.
 * @param responseType - axios response type. The resolved value is typed as
 *   `string` for `'text'`, `ArrayBuffer` for `'arraybuffer'`, and `any` for
 *   every other response type.
 * @param timeout - Timeout in milliseconds handed to axios (default 150000).
 * @returns A promise resolving with `res.data` when the response status is 200.
 * @throws The returned promise rejects with an `Error` whose message contains
 *   the URL and the failure reason when the request fails or the status is
 *   anything other than 200.
 */
const axiosGet = <T extends AxiosResponseType>(url: string, responseType: T, timeout = 150000) => {
  logger.info(`fetching ${url}`);
  return axios({
    method: 'get',
    url,
    responseType,
    timeout,
  }).then(res => {
    if (res.status !== 200) {
      // Thrown here and reported exactly once by the catch handler below;
      // logging in both places would emit two error lines per failure.
      throw new Error(`${res.status}`);
    }
    logger.info(`successfully fetched ${url}`);
    return res.data as {text: string, arraybuffer: ArrayBuffer, [k: string]: any}[T];
  }).catch(err => {
    const reason = err instanceof Error ? err.message : err;
    logger.error(`failed to fetch ${url}: ${reason}`);
    // Re-throw with context so callers are not left with a message-less Error.
    throw new Error(`failed to fetch ${url}: ${reason}`);
  });
};
|
|
|
+
|
|
|
class Webshot extends CallableInstance<[Tweet[], (...args) => void, number], Promise<void>> {
|
|
|
|
|
|
private browser: puppeteer.Browser;
|
|
@@ -115,7 +136,20 @@ class Webshot extends CallableInstance<[Tweet[], (...args) => void, number], Pro
|
|
|
width: width / zoomFactor,
|
|
|
height: height / zoomFactor,
|
|
|
})
|
|
|
- .then(() => page.route('*:\/\/video.twimg.com\/**', route => route.abort()))
|
|
|
+ .then(() => page.route('*://video.twimg.com/**', route =>
|
|
|
+ route.abort().then(() => page.evaluate(videoUrl => {
|
|
|
+ let videoUrls: string[] = window['__scrapedVideoUrls'];
|
|
|
+ if (!videoUrls) videoUrls = window['__scrapedVideoUrls'] = [];
|
|
|
+ if (!videoUrls.includes(videoUrl)) {
|
|
|
+ videoUrls.push(videoUrl);
|
|
|
+ return videoUrl;
|
|
|
+ }
|
|
|
+ }, route.request().url())).then(videoUrl => {
|
|
|
+ if (videoUrl) logger.info(`scraped ${route.request().url()} from page`);
|
|
|
+ }).catch(err => {
|
|
|
+ logger.error(`error aborting request to ${route.request().url()}, error: ${err}`);
|
|
|
+ })
|
|
|
+ ))
|
|
|
.then(gotoUrlAndWaitForTweet)
|
|
|
// hide header, "more options" button, like and retweet count
|
|
|
.then(() => page.addStyleTag({
|
|
@@ -232,31 +266,15 @@ class Webshot extends CallableInstance<[Tweet[], (...args) => void, number], Pro
|
|
|
);
|
|
|
};
|
|
|
|
|
|
- private fetchMedia = (url: string): Promise<string> => new Promise<ArrayBuffer>((resolve, reject) => {
|
|
|
- logger.info(`fetching ${url}`);
|
|
|
- axios({
|
|
|
- method: 'get',
|
|
|
- url,
|
|
|
- responseType: 'arraybuffer',
|
|
|
- timeout: 150000,
|
|
|
- }).then(res => {
|
|
|
- if (res.status === 200) {
|
|
|
- logger.info(`successfully fetched ${url}`);
|
|
|
- resolve(res.data);
|
|
|
- } else {
|
|
|
- logger.error(`failed to fetch ${url}: ${res.status}`);
|
|
|
- reject();
|
|
|
- }
|
|
|
- }).catch (err => {
|
|
|
- logger.error(`failed to fetch ${url}: ${err instanceof Error ? err.message : err}`);
|
|
|
- reject();
|
|
|
- });
|
|
|
- }).then(data =>
|
|
|
- (ext => {
|
|
|
- const mediaTempFilePath = temp.path({suffix: `.${ext}`});
|
|
|
- writeFileSync(mediaTempFilePath, Buffer.from(data));
|
|
|
- const path = `file://${mediaTempFilePath}`;
|
|
|
- switch (ext) {
|
|
|
+ private fetchMedia = (url: string) =>
|
|
|
+ (url.match(/^file:/) ? Promise.resolve(url) : axiosGet(url, 'arraybuffer').then(data =>
|
|
|
+ (ext => {
|
|
|
+ const mediaTempFilePath = temp.path({suffix: `.${ext}`});
|
|
|
+ writeFileSync(mediaTempFilePath, Buffer.from(data));
|
|
|
+ return `file://${mediaTempFilePath}`;
|
|
|
+ })(((/\?format=([a-z]+)&/.exec(url)) ?? (/.*\/.*\.([^?]+)/.exec(url)))[1])
|
|
|
+ )).then(path => {
|
|
|
+ switch ((/.*\.(.*?)$/.exec(path) || [])[1]) {
|
|
|
case 'jpg':
|
|
|
case 'png':
|
|
|
return Message.Image(path);
|
|
@@ -265,8 +283,7 @@ class Webshot extends CallableInstance<[Tweet[], (...args) => void, number], Pro
|
|
|
}
|
|
|
logger.warn('unable to find MIME type of fetched media, failing this fetch');
|
|
|
throw Error();
|
|
|
- })(((/\?format=([a-z]+)&/.exec(url)) ?? (/.*\/.*\.([^?]+)/.exec(url)))[1])
|
|
|
- );
|
|
|
+ });
|
|
|
|
|
|
public webshot(
|
|
|
tweets: Tweet[],
|
|
@@ -305,15 +322,10 @@ class Webshot extends CallableInstance<[Tweet[], (...args) => void, number], Pro
|
|
|
// invoke webshot
|
|
|
if (this.mode === 0) {
|
|
|
const url = `https://mobile.twitter.com/${twi.user.screen_name}/status/${twi.id_str}`;
|
|
|
- const extendEntity = (cardImg: MediaEntity) => {
|
|
|
- originTwi.extended_entities = {
|
|
|
- ...originTwi.extended_entities,
|
|
|
- media: [
|
|
|
- ...originTwi.extended_entities?.media ?? [],
|
|
|
- cardImg,
|
|
|
- ],
|
|
|
- };
|
|
|
- };
|
|
|
// Register a piece of card media on originTwi.extended_entities.media.
// The extended_entities object and its media array are created when absent,
// and the entry is skipped if one with the same id_str is already listed.
const extendEntity = (cardMedia: MediaEntity) => {
  const entities = (originTwi.extended_entities ||= {});
  const mediaList = (entities.media ||= []);
  const alreadyListed = mediaList.some(entity => entity.id_str === cardMedia.id_str);
  if (!alreadyListed) mediaList.push(cardMedia);
};
|
|
|
const truncateLongThread = (atId: string) => {
|
|
|
if (!atId) return;
|
|
|
logger.info(`thread too long, truncating at tweet ${atId}...`);
|
|
@@ -340,28 +352,85 @@ class Webshot extends CallableInstance<[Tweet[], (...args) => void, number], Pro
|
|
|
document.documentElement.scrollTop = 0;
|
|
|
}).then(truncateLongThread),
|
|
|
|
|
|
- // scrape card image from main tweet
|
|
|
- (_, tweetHandle: puppeteer.ElementHandle<HTMLDivElement>) => tweetHandle.evaluate(div => {
|
|
|
- const cardImg = div.querySelector('div[data-testid^="card.layout"][data-testid$=".media"] img');
|
|
|
- if (typeof cardImg?.getAttribute('src') === 'string') {
|
|
|
- const match = /^(.*\/card_img\/(\d+)\/.+\?format=.*)&name=/.exec(cardImg?.getAttribute('src'));
|
|
|
- if (match) {
|
|
|
- // tslint:disable-next-line: variable-name
|
|
|
- const [media_url_https, id_str] = match.slice(1);
|
|
|
- return {
|
|
|
+ // scrape card media from main tweet
|
|
|
+ (page, tweetHandle: puppeteer.ElementHandle<HTMLDivElement>) => tweetHandle.evaluate(div => {
|
|
|
+ const cardMedia = div.querySelector('div[data-testid^="card.layout"][data-testid$=".media"] img, video');
|
|
|
+ let match: RegExpExecArray;
|
|
|
+ if (cardMedia?.tagName === 'IMG' && typeof cardMedia?.getAttribute('src') === 'string') {
|
|
|
+ match = /^(.*\/card_img\/(\d+)\/.+\?format=.*)&name=/.exec(cardMedia?.getAttribute('src'));
|
|
|
+ }
|
|
|
+ if (cardMedia?.tagName === 'VIDEO' && typeof cardMedia?.getAttribute('poster') === 'string') {
|
|
|
+ match = /^(.*\/amplify_video_thumb\/(\d+)\/img\/.*$)/.exec(cardMedia?.getAttribute('poster'));
|
|
|
+ }
|
|
|
+ if (match) {
|
|
|
+ const [media_url_https, id_str] = match.slice(1);
|
|
|
+ return {
|
|
|
+ type: cardMedia.tagName,
|
|
|
+ entityBase: {
|
|
|
media_url: media_url_https.replace(/^https/, 'http'),
|
|
|
media_url_https,
|
|
|
url: '',
|
|
|
display_url: '',
|
|
|
expanded_url: '',
|
|
|
- type: 'photo',
|
|
|
id: Number(id_str),
|
|
|
id_str,
|
|
|
sizes: undefined,
|
|
|
- };
|
|
|
- }
|
|
|
+ }
|
|
|
+ };
|
|
|
}
|
|
|
- }).then(cardImg => { if (cardImg) extendEntity(cardImg); })
|
|
|
+ return {};
|
|
|
+ }).then(({type, entityBase}) => {
|
|
|
+ if (type === 'IMG') extendEntity({
|
|
|
+ ...entityBase,
|
|
|
+ type: 'photo',
|
|
|
+ });
|
|
|
+ if (type === 'VIDEO') page.evaluate(
|
|
|
+ id_str => (window['__scrapedVideoUrls'] as string[])?.filter(videoUrl =>
|
|
|
+ new RegExp(`.*/amplify_video/${id_str}.*\\.m3u8(?:\\?|$)`).exec(videoUrl)
|
|
|
+ ),
|
|
|
+ entityBase.id_str
|
|
|
+ ).then(videoUrls => {
|
|
|
+ if (videoUrls && videoUrls.length) {
|
|
|
+ Promise.all(videoUrls.map(streamlistUrl =>
|
|
|
+ axiosGet(streamlistUrl, 'text')
|
|
|
+ .then(streamlist => M3u8.parseStreamlist(streamlist)[0])
|
|
|
+ .then(({bandwidth, playlistPath, resolution}) => {
|
|
|
+ const [width, height] = /(.*)x(.*)/.exec(resolution).slice(1).map(Number);
|
|
|
+ const playlistUrl = new URL(playlistPath, streamlistUrl);
|
|
|
+ const mediaTempFilePath = temp.path({suffix: `.mp4`});
|
|
|
+ return axiosGet(playlistUrl.href, 'text')
|
|
|
+ .then(playlist => M3u8.parsePlaylist(playlist))
|
|
|
+ .then(({duration, segmentPaths}) =>
|
|
|
+ chainPromises(segmentPaths.map(path => () =>
|
|
|
+ axiosGet(new URL(path, playlistUrl).href, 'arraybuffer').then(data => {
|
|
|
+ writeFileSync(mediaTempFilePath, Buffer.from(data), {flag: 'a'});
|
|
|
+ })
|
|
|
+ )).then(() => ({
|
|
|
+ duration_millis: duration * 1000,
|
|
|
+ aspect_ratio: [width, height],
|
|
|
+ variants: [{
|
|
|
+ bitrate: bandwidth,
|
|
|
+ content_type: 'video/mp4',
|
|
|
+ url: `file://${mediaTempFilePath}`,
|
|
|
+ }]
|
|
|
+ }) as MediaEntity['video_info'])
|
|
|
+ )
|
|
|
+ })
|
|
|
+ )).then(videoInfos =>
|
|
|
+ videoInfos.reduce((vi1, vi2) => ({
|
|
|
+ ...vi1,
|
|
|
+ variants: vi1.variants.concat(vi2.variants)
|
|
|
+ }))
|
|
|
+ ).then(videoInfo => extendEntity({
|
|
|
+ ...entityBase,
|
|
|
+ type: 'video',
|
|
|
+ video_info: videoInfo,
|
|
|
+ })).catch(error => {
|
|
|
+ logger.warn('unable to fetch scraped video, ignoring...');
|
|
|
+ });
|
|
|
+ }
|
|
|
+ });
|
|
|
+ })
|
|
|
))
|
|
|
.then(fileurl => {
|
|
|
if (fileurl) return Message.Image(fileurl);
|