|
@@ -2,7 +2,7 @@ import { writeFileSync } from 'fs';
|
|
import { Readable } from 'stream';
|
|
import { Readable } from 'stream';
|
|
import { promisify } from 'util';
|
|
import { promisify } from 'util';
|
|
|
|
|
|
-import axios from 'axios';
|
|
|
|
|
|
+import axios, { ResponseType as AxiosResponseType } from 'axios';
|
|
import * as CallableInstance from 'callable-instance';
|
|
import * as CallableInstance from 'callable-instance';
|
|
import { XmlEntities } from 'html-entities';
|
|
import { XmlEntities } from 'html-entities';
|
|
import { PNG } from 'pngjs';
|
|
import { PNG } from 'pngjs';
|
|
@@ -13,7 +13,7 @@ import * as temp from 'temp';
|
|
import { getLogger } from './loggers';
|
|
import { getLogger } from './loggers';
|
|
import { Message } from './koishi';
|
|
import { Message } from './koishi';
|
|
import { MediaEntity, Tweet } from './twitter';
|
|
import { MediaEntity, Tweet } from './twitter';
|
|
-import { chainPromises } from './utils';
|
|
|
|
|
|
+import { chainPromises, M3u8 } from './utils';
|
|
|
|
|
|
const xmlEntities = new XmlEntities();
|
|
const xmlEntities = new XmlEntities();
|
|
|
|
|
|
@@ -30,6 +30,27 @@ const typeInZH = {
|
|
|
|
|
|
const logger = getLogger('webshot');
|
|
const logger = getLogger('webshot');
|
|
|
|
|
|
|
|
+const axiosGet = <T extends AxiosResponseType>(url: string, responseType: T, timeout = 150000) => {
|
|
|
|
+ logger.info(`fetching ${url}`);
|
|
|
|
+ return axios({
|
|
|
|
+ method: 'get',
|
|
|
|
+ url,
|
|
|
|
+ responseType,
|
|
|
|
+ timeout,
|
|
|
|
+ }).then(res => {
|
|
|
|
+ if (res.status === 200) {
|
|
|
|
+ logger.info(`successfully fetched ${url}`);
|
|
|
|
+ return res.data as {text: string, arraybuffer: ArrayBuffer, [k: string]: any}[T];
|
|
|
|
+ } else {
|
|
|
|
+ logger.error(`failed to fetch ${url}: ${res.status}`);
|
|
|
|
+ throw new Error();
|
|
|
|
+ }
|
|
|
|
+ }).catch (err => {
|
|
|
|
+ logger.error(`failed to fetch ${url}: ${err instanceof Error ? err.message : err}`);
|
|
|
|
+ throw new Error();
|
|
|
|
+ });
|
|
|
|
+};
|
|
|
|
+
|
|
class Webshot extends CallableInstance<[Tweet[], (...args) => void, number], Promise<void>> {
|
|
class Webshot extends CallableInstance<[Tweet[], (...args) => void, number], Promise<void>> {
|
|
|
|
|
|
private browser: puppeteer.Browser;
|
|
private browser: puppeteer.Browser;
|
|
@@ -95,25 +116,41 @@ class Webshot extends CallableInstance<[Tweet[], (...args) => void, number], Pro
|
|
const startTime = new Date().getTime();
|
|
const startTime = new Date().getTime();
|
|
const getTimerTime = () => new Date().getTime() - startTime;
|
|
const getTimerTime = () => new Date().getTime() - startTime;
|
|
const getTimeout = () => Math.max(500, webshotDelay - getTimerTime());
|
|
const getTimeout = () => Math.max(500, webshotDelay - getTimerTime());
|
|
- const goto = () => page.goto(url, {waitUntil: 'load', timeout: Math.min(10000, getTimeout())}).catch(err => {
|
|
|
|
- if (err.name === 'TimeoutError' && webshotDelay > getTimerTime()) {
|
|
|
|
- logger.warn(`navigation timed out after ${getTimerTime()} ms, retrying...`);
|
|
|
|
- return goto();
|
|
|
|
- }
|
|
|
|
- throw err;
|
|
|
|
- });
|
|
|
|
|
|
+ const gotoUrlAndWaitForTweet = () =>
|
|
|
|
+ page.goto(url, {waitUntil: 'load', timeout: Math.min(10000, getTimeout())})
|
|
|
|
+ .then(() => Promise.race([
|
|
|
|
+ page.waitForSelector('article', {state: 'attached', timeout: getTimeout()}),
|
|
|
|
+ page.click('#placeholder+#ScriptLoadFailure input[value="Try again"]', {timeout: getTimeout()}),
|
|
|
|
+ page.waitForSelector(
|
|
|
|
+ 'div[role="button"]>div>span>:text-matches("^やりなおす|更新$")'
|
|
|
|
+ , {state: 'attached', timeout: getTimeout()}).then(() => page.reload({timeout: getTimeout()})),
|
|
|
|
+ ]))
|
|
|
|
+ .catch(err => {
|
|
|
|
+ if (err.name === 'TimeoutError' && webshotDelay > getTimerTime()) {
|
|
|
|
+ logger.warn(`navigation timed out after ${getTimerTime()} ms, retrying...`);
|
|
|
|
+ return gotoUrlAndWaitForTweet();
|
|
|
|
+ }
|
|
|
|
+ throw err;
|
|
|
|
+ });
|
|
page.setViewportSize({
|
|
page.setViewportSize({
|
|
width: width / zoomFactor,
|
|
width: width / zoomFactor,
|
|
height: height / zoomFactor,
|
|
height: height / zoomFactor,
|
|
})
|
|
})
|
|
- .then(() => page.route('*:\/\/video.twimg.com\/**', route => route.abort()))
|
|
|
|
- .then(goto)
|
|
|
|
- .then(() => Promise.race([
|
|
|
|
- page.waitForSelector('article', {state: 'attached', timeout: getTimeout()}),
|
|
|
|
- page.click('#placeholder+#ScriptLoadFailure input[value="Try again"]', {timeout: getTimeout()}),
|
|
|
|
- page.waitForSelector('div[role="button"]>div>span>:text-matches("^やりなおす|更新$")', {state: 'attached', timeout: getTimeout()})
|
|
|
|
- .then(() => page.reload({timeout: getTimeout()})),
|
|
|
|
- ]))
|
|
|
|
|
|
+ .then(() => page.route('*://video.twimg.com/**', route =>
|
|
|
|
+ route.abort().then(() => page.evaluate(videoUrl => {
|
|
|
|
+ let videoUrls: string[] = window['__scrapedVideoUrls'];
|
|
|
|
+ if (!videoUrls) videoUrls = window['__scrapedVideoUrls'] = [];
|
|
|
|
+ if (!videoUrls.includes(videoUrl)) {
|
|
|
|
+ videoUrls.push(videoUrl);
|
|
|
|
+ return videoUrl;
|
|
|
|
+ }
|
|
|
|
+ }, route.request().url())).then(videoUrl => {
|
|
|
|
+ if (videoUrl) logger.info(`scraped ${route.request().url()} from page`);
|
|
|
|
+ }).catch(err => {
|
|
|
|
+ logger.error(`error aborting request to ${route.request().url()}, error: ${err}`);
|
|
|
|
+ })
|
|
|
|
+ ))
|
|
|
|
+ .then(gotoUrlAndWaitForTweet)
|
|
// hide header, "more options" button, like and retweet count
|
|
// hide header, "more options" button, like and retweet count
|
|
.then(() => page.addStyleTag({
|
|
.then(() => page.addStyleTag({
|
|
content: 'header,#layers{display:none!important}article{background-color:transparent!important}' +
|
|
content: 'header,#layers{display:none!important}article{background-color:transparent!important}' +
|
|
@@ -163,7 +200,9 @@ class Webshot extends CallableInstance<[Tweet[], (...args) => void, number], Pro
|
|
const path = temp.path({suffix: '.html'});
|
|
const path = temp.path({suffix: '.html'});
|
|
writeFileSync(path, html);
|
|
writeFileSync(path, html);
|
|
logger.warn(`saved debug html to ${path}`);
|
|
logger.warn(`saved debug html to ${path}`);
|
|
- }).then(() => page.route('**/*', route => route.abort())
|
|
|
|
|
|
+ }).then(() => page.route('**/*', route => route.abort().catch(err => {
|
|
|
|
+ logger.error(`error aborting all requests for debug screenshot, error: ${err}`);
|
|
|
|
+ }))
|
|
).then(() => page.screenshot({fullPage: true})).then(screenshot => {
|
|
).then(() => page.screenshot({fullPage: true})).then(screenshot => {
|
|
sharpToFile(sharp(screenshot).jpeg({ quality: 90 })).then(fileUri => {
|
|
sharpToFile(sharp(screenshot).jpeg({ quality: 90 })).then(fileUri => {
|
|
logger.warn(`saved debug screenshot to ${fileUri.substring(7)}`);
|
|
logger.warn(`saved debug screenshot to ${fileUri.substring(7)}`);
|
|
@@ -172,8 +211,12 @@ class Webshot extends CallableInstance<[Tweet[], (...args) => void, number], Pro
|
|
})
|
|
})
|
|
.then(handle => {
|
|
.then(handle => {
|
|
if (handle === null) throw new puppeteer.errors.TimeoutError();
|
|
if (handle === null) throw new puppeteer.errors.TimeoutError();
|
|
|
|
+ let cropTop: number;
|
|
return chainPromises(morePostProcessings.map(func => () => func(page, handle)))
|
|
return chainPromises(morePostProcessings.map(func => () => func(page, handle)))
|
|
.then(() => promisify(setTimeout)(getTimeout()))
|
|
.then(() => promisify(setTimeout)(getTimeout()))
|
|
|
|
+ // determine screenshot crop y offset
|
|
|
|
+ .then(() => page.evaluate(() => document.documentElement.scrollTop))
|
|
|
|
+ .then(scrollTop => { cropTop = scrollTop * zoomFactor; })
|
|
// hide highlight of retweet header
|
|
// hide highlight of retweet header
|
|
.then(() => page.evaluate(() => (document.activeElement as unknown as HTMLOrSVGElement).blur()))
|
|
.then(() => page.evaluate(() => (document.activeElement as unknown as HTMLOrSVGElement).blur()))
|
|
// determine screenshot height
|
|
// determine screenshot height
|
|
@@ -187,16 +230,23 @@ class Webshot extends CallableInstance<[Tweet[], (...args) => void, number], Pro
|
|
logger.error(`error while parsing content height, failing this webshot`);
|
|
logger.error(`error while parsing content height, failing this webshot`);
|
|
throw err;
|
|
throw err;
|
|
})
|
|
})
|
|
- .then(parentDivHandle => parentDivHandle.screenshot());
|
|
|
|
|
|
+ .then(parentDivHandle => parentDivHandle.screenshot())
|
|
|
|
+ .then<[Buffer, number]>(screenshot => [screenshot, cropTop]);
|
|
})
|
|
})
|
|
- .then(screenshot => {
|
|
|
|
|
|
+ .then(([screenshot, cropTop]) => {
|
|
new PNG({
|
|
new PNG({
|
|
filterType: 4,
|
|
filterType: 4,
|
|
deflateLevel: 0,
|
|
deflateLevel: 0,
|
|
}).on('parsed', function () {
|
|
}).on('parsed', function () {
|
|
- sharpToFile(jpeg(this.pack())).then(path => {
|
|
|
|
|
|
+ let png = this;
|
|
|
|
+ if (cropTop > 0) {
|
|
|
|
+ logger.info(`cropping screenshot at y offset ${cropTop}...`);
|
|
|
|
+ png = new PNG({width: this.width, height: this.height - cropTop});
|
|
|
|
+ this.bitblt(png, 0, cropTop, png.width, png.height, 0, 0);
|
|
|
|
+ }
|
|
|
|
+ sharpToFile(jpeg(png.pack())).then(path => {
|
|
logger.info(`finished webshot for ${url}`);
|
|
logger.info(`finished webshot for ${url}`);
|
|
- resolve({path, boundary: this.height});
|
|
|
|
|
|
+ resolve({path, boundary: png.height});
|
|
});
|
|
});
|
|
}).parse(screenshot);
|
|
}).parse(screenshot);
|
|
})
|
|
})
|
|
@@ -218,31 +268,15 @@ class Webshot extends CallableInstance<[Tweet[], (...args) => void, number], Pro
|
|
);
|
|
);
|
|
};
|
|
};
|
|
|
|
|
|
- private fetchMedia = (url: string): Promise<string> => new Promise<ArrayBuffer>((resolve, reject) => {
|
|
|
|
- logger.info(`fetching ${url}`);
|
|
|
|
- axios({
|
|
|
|
- method: 'get',
|
|
|
|
- url,
|
|
|
|
- responseType: 'arraybuffer',
|
|
|
|
- timeout: 150000,
|
|
|
|
- }).then(res => {
|
|
|
|
- if (res.status === 200) {
|
|
|
|
- logger.info(`successfully fetched ${url}`);
|
|
|
|
- resolve(res.data);
|
|
|
|
- } else {
|
|
|
|
- logger.error(`failed to fetch ${url}: ${res.status}`);
|
|
|
|
- reject();
|
|
|
|
- }
|
|
|
|
- }).catch (err => {
|
|
|
|
- logger.error(`failed to fetch ${url}: ${err instanceof Error ? err.message : err}`);
|
|
|
|
- reject();
|
|
|
|
- });
|
|
|
|
- }).then(data =>
|
|
|
|
- (ext => {
|
|
|
|
- const mediaTempFilePath = temp.path({suffix: `.${ext}`});
|
|
|
|
- writeFileSync(mediaTempFilePath, Buffer.from(data));
|
|
|
|
- const path = `file://${mediaTempFilePath}`;
|
|
|
|
- switch (ext) {
|
|
|
|
|
|
+ private fetchMedia = (url: string) =>
|
|
|
|
+ (url.match(/^file:/) ? Promise.resolve(url) : axiosGet(url, 'arraybuffer').then(data =>
|
|
|
|
+ (ext => {
|
|
|
|
+ const mediaTempFilePath = temp.path({suffix: `.${ext}`});
|
|
|
|
+ writeFileSync(mediaTempFilePath, Buffer.from(data));
|
|
|
|
+ return `file://${mediaTempFilePath}`;
|
|
|
|
+ })(((/\?format=([a-z]+)&/.exec(url)) ?? (/.*\/.*\.([^?]+)/.exec(url)))[1])
|
|
|
|
+ )).then(path => {
|
|
|
|
+ switch ((/.*\.(.*?)$/.exec(path) || [])[1]) {
|
|
case 'jpg':
|
|
case 'jpg':
|
|
case 'png':
|
|
case 'png':
|
|
return Message.Image(path);
|
|
return Message.Image(path);
|
|
@@ -251,8 +285,7 @@ class Webshot extends CallableInstance<[Tweet[], (...args) => void, number], Pro
|
|
}
|
|
}
|
|
logger.warn('unable to find MIME type of fetched media, failing this fetch');
|
|
logger.warn('unable to find MIME type of fetched media, failing this fetch');
|
|
throw Error();
|
|
throw Error();
|
|
- })(((/\?format=([a-z]+)&/.exec(url)) ?? (/.*\/.*\.([^?]+)/.exec(url)))[1])
|
|
|
|
- );
|
|
|
|
|
|
+ });
|
|
|
|
|
|
public webshot(
|
|
public webshot(
|
|
tweets: Tweet[],
|
|
tweets: Tweet[],
|
|
@@ -291,15 +324,10 @@ class Webshot extends CallableInstance<[Tweet[], (...args) => void, number], Pro
|
|
// invoke webshot
|
|
// invoke webshot
|
|
if (this.mode === 0) {
|
|
if (this.mode === 0) {
|
|
const url = `https://mobile.twitter.com/${twi.user.screen_name}/status/${twi.id_str}`;
|
|
const url = `https://mobile.twitter.com/${twi.user.screen_name}/status/${twi.id_str}`;
|
|
- const extendEntity = (cardImg: MediaEntity) => {
|
|
|
|
- originTwi.extended_entities = {
|
|
|
|
- ...originTwi.extended_entities,
|
|
|
|
- media: [
|
|
|
|
- ...originTwi.extended_entities?.media ?? [],
|
|
|
|
- cardImg,
|
|
|
|
- ],
|
|
|
|
- };
|
|
|
|
- };
|
|
|
|
|
|
+ const extendEntity = (cardMedia: MediaEntity) =>
|
|
|
|
+ (media => {
|
|
|
|
+ if (!media.some(entity => entity.id_str === cardMedia.id_str)) media.push(cardMedia);
|
|
|
|
+ })((originTwi.extended_entities ||= {}).media ||= []);
|
|
const truncateLongThread = (atId: string) => {
|
|
const truncateLongThread = (atId: string) => {
|
|
if (!atId) return;
|
|
if (!atId) return;
|
|
logger.info(`thread too long, truncating at tweet ${atId}...`);
|
|
logger.info(`thread too long, truncating at tweet ${atId}...`);
|
|
@@ -326,28 +354,85 @@ class Webshot extends CallableInstance<[Tweet[], (...args) => void, number], Pro
|
|
document.documentElement.scrollTop = 0;
|
|
document.documentElement.scrollTop = 0;
|
|
}).then(truncateLongThread),
|
|
}).then(truncateLongThread),
|
|
|
|
|
|
- // scrape card image from main tweet
|
|
|
|
- (_, tweetHandle: puppeteer.ElementHandle<HTMLDivElement>) => tweetHandle.evaluate(div => {
|
|
|
|
- const cardImg = div.querySelector('div[data-testid^="card.layout"][data-testid$=".media"] img');
|
|
|
|
- if (typeof cardImg?.getAttribute('src') === 'string') {
|
|
|
|
- const match = /^(.*\/card_img\/(\d+)\/.+\?format=.*)&name=/.exec(cardImg?.getAttribute('src'));
|
|
|
|
- if (match) {
|
|
|
|
- // tslint:disable-next-line: variable-name
|
|
|
|
- const [media_url_https, id_str] = match.slice(1);
|
|
|
|
- return {
|
|
|
|
|
|
+ // scrape card media from main tweet
|
|
|
|
+ (page, tweetHandle: puppeteer.ElementHandle<HTMLDivElement>) => tweetHandle.evaluate(div => {
|
|
|
|
+ const cardMedia = div.querySelector('div[data-testid^="card.layout"][data-testid$=".media"] img, video');
|
|
|
|
+ let match: RegExpExecArray;
|
|
|
|
+ if (cardMedia?.tagName === 'IMG' && typeof cardMedia?.getAttribute('src') === 'string') {
|
|
|
|
+ match = /^(.*\/card_img\/(\d+)\/.+\?format=.*)&name=/.exec(cardMedia?.getAttribute('src'));
|
|
|
|
+ }
|
|
|
|
+ if (cardMedia?.tagName === 'VIDEO' && typeof cardMedia?.getAttribute('poster') === 'string') {
|
|
|
|
+ match = /^(.*\/amplify_video_thumb\/(\d+)\/img\/.*$)/.exec(cardMedia?.getAttribute('poster'));
|
|
|
|
+ }
|
|
|
|
+ if (match) {
|
|
|
|
+ const [media_url_https, id_str] = match.slice(1);
|
|
|
|
+ return {
|
|
|
|
+ type: cardMedia.tagName,
|
|
|
|
+ entityBase: {
|
|
media_url: media_url_https.replace(/^https/, 'http'),
|
|
media_url: media_url_https.replace(/^https/, 'http'),
|
|
media_url_https,
|
|
media_url_https,
|
|
url: '',
|
|
url: '',
|
|
display_url: '',
|
|
display_url: '',
|
|
expanded_url: '',
|
|
expanded_url: '',
|
|
- type: 'photo',
|
|
|
|
id: Number(id_str),
|
|
id: Number(id_str),
|
|
id_str,
|
|
id_str,
|
|
sizes: undefined,
|
|
sizes: undefined,
|
|
- };
|
|
|
|
- }
|
|
|
|
|
|
+ }
|
|
|
|
+ };
|
|
}
|
|
}
|
|
- }).then(cardImg => { if (cardImg) extendEntity(cardImg); })
|
|
|
|
|
|
+ return {};
|
|
|
|
+ }).then(({type, entityBase}) => {
|
|
|
|
+ if (type === 'IMG') extendEntity({
|
|
|
|
+ ...entityBase,
|
|
|
|
+ type: 'photo',
|
|
|
|
+ });
|
|
|
|
+ if (type === 'VIDEO') page.evaluate(
|
|
|
|
+ id_str => (window['__scrapedVideoUrls'] as string[])?.filter(videoUrl =>
|
|
|
|
+ new RegExp(`.*/amplify_video/${id_str}.*\\.m3u8(?:\\?|$)`).exec(videoUrl)
|
|
|
|
+ ),
|
|
|
|
+ entityBase.id_str
|
|
|
|
+ ).then(videoUrls => {
|
|
|
|
+ if (videoUrls && videoUrls.length) {
|
|
|
|
+ Promise.all(videoUrls.map(streamlistUrl =>
|
|
|
|
+ axiosGet(streamlistUrl, 'text')
|
|
|
|
+ .then(streamlist => M3u8.parseStreamlist(streamlist)[0])
|
|
|
|
+ .then(({bandwidth, playlistPath, resolution}) => {
|
|
|
|
+ const [width, height] = /(.*)x(.*)/.exec(resolution).slice(1).map(Number);
|
|
|
|
+ const playlistUrl = new URL(playlistPath, streamlistUrl);
|
|
|
|
+ const mediaTempFilePath = temp.path({suffix: `.mp4`});
|
|
|
|
+ return axiosGet(playlistUrl.href, 'text')
|
|
|
|
+ .then(playlist => M3u8.parsePlaylist(playlist))
|
|
|
|
+ .then(({duration, segmentPaths}) =>
|
|
|
|
+ chainPromises(segmentPaths.map(path => () =>
|
|
|
|
+ axiosGet(new URL(path, playlistUrl).href, 'arraybuffer').then(data => {
|
|
|
|
+ writeFileSync(mediaTempFilePath, Buffer.from(data), {flag: 'a'});
|
|
|
|
+ })
|
|
|
|
+ )).then(() => ({
|
|
|
|
+ duration_millis: duration * 1000,
|
|
|
|
+ aspect_ratio: [width, height],
|
|
|
|
+ variants: [{
|
|
|
|
+ bitrate: bandwidth,
|
|
|
|
+ content_type: 'video/mp4',
|
|
|
|
+ url: `file://${mediaTempFilePath}`,
|
|
|
|
+ }]
|
|
|
|
+ }) as MediaEntity['video_info'])
|
|
|
|
+ )
|
|
|
|
+ })
|
|
|
|
+ )).then(videoInfos =>
|
|
|
|
+ videoInfos.reduce((vi1, vi2) => ({
|
|
|
|
+ ...vi1,
|
|
|
|
+ variants: vi1.variants.concat(vi2.variants)
|
|
|
|
+ }))
|
|
|
|
+ ).then(videoInfo => extendEntity({
|
|
|
|
+ ...entityBase,
|
|
|
|
+ type: 'video',
|
|
|
|
+ video_info: videoInfo,
|
|
|
|
+ })).catch(error => {
|
|
|
|
+ logger.warn('unable to fetch scraped video, ignoring...');
|
|
|
|
+ });
|
|
|
|
+ }
|
|
|
|
+ });
|
|
|
|
+ })
|
|
))
|
|
))
|
|
.then(fileurl => {
|
|
.then(fileurl => {
|
|
if (fileurl) return Message.Image(fileurl);
|
|
if (fileurl) return Message.Image(fileurl);
|