Kaynağa Gözat

extend media entities from page: m4s video/dedup

Mike L 3 yıl önce
ebeveyn
işleme
f43faf0488
4 değiştirilmiş dosya ile 271 ekleme ve 103 silme
  1. 22 1
      dist/utils.js
  2. 104 50
      dist/webshot.js
  3. 24 0
      src/utils.ts
  4. 121 52
      src/webshot.ts

+ 22 - 1
dist/utils.js

@@ -1,6 +1,6 @@
 "use strict";
 Object.defineProperty(exports, "__esModule", { value: true });
-exports.BigNumOps = exports.chainPromises = void 0;
+exports.M3u8 = exports.BigNumOps = exports.chainPromises = void 0;
 const chainPromises = (lazyPromises, reducer = (lp1, lp2) => (p) => lp1(p).then(lp2), initialValue) => lazyPromises.reduce(reducer, p => Promise.resolve(p))(initialValue);
 exports.chainPromises = chainPromises;
 const splitBigNumAt = (num, at) => num.replace(RegExp(String.raw `^([+-]?)(\d+)(\d{${at}})$`), '$1$2,$1$3')
@@ -49,3 +49,24 @@ exports.BigNumOps = {
     lShift: bigNumLShift,
     parse: parseBigNum,
 };
/**
 * Parse an HLS master playlist ("stream list") into its variant streams.
 *
 * Each `#EXT-X-STREAM-INF:` tag line is paired with the line directly after
 * it, which is taken as the variant's playlist path. Attributes are looked up
 * by name, so their order, the presence of other attributes (including quoted
 * values that contain commas, e.g. CODECS="a,b") and CRLF line endings do not
 * affect the result.
 *
 * @param {string} str - raw text of the master playlist
 * @returns {{bandwidth: number, resolution: string, playlistPath: string}[]}
 *   one entry per variant that declares both BANDWIDTH and RESOLUTION and is
 *   followed by a path line; variants missing any of these are skipped so
 *   that `resolution` is always a "<width>x<height>" string.
 */
const parseStreamlist = (str) => {
    const STREAM_INF_TAG = '#EXT-X-STREAM-INF:';
    const lines = str.split(/\r?\n/);
    const variants = [];
    lines.forEach((line, index) => {
        if (!line.startsWith(STREAM_INF_TAG))
            return;
        const attributes = line.slice(STREAM_INF_TAG.length);
        // `(?:^|,)` keeps AVERAGE-BANDWIDTH from being mistaken for BANDWIDTH
        const bandwidth = /(?:^|,)BANDWIDTH=(\d+)/.exec(attributes);
        const resolution = /(?:^|,)RESOLUTION=(\d+x\d+)/.exec(attributes);
        const playlistPath = (lines[index + 1] || '').trim();
        // a following tag/comment line means this variant has no usable path
        if (!bandwidth || !resolution || !playlistPath || playlistPath.startsWith('#'))
            return;
        variants.push({
            bandwidth: Number(bandwidth[1]),
            resolution: resolution[1],
            playlistPath,
        });
    });
    return variants;
};
/**
 * Parse an HLS media playlist into its total duration and ordered segment
 * paths.
 *
 * If the playlist declares an initialization segment via
 * `#EXT-X-MAP:...URI="..."`, that URI is placed first in `segmentPaths`;
 * otherwise the list contains media segments only (no placeholder entry).
 * Every `#EXTINF:<duration>,[<title>]` tag is paired with the next line that
 * is neither blank nor a tag/comment. CRLF line endings are accepted.
 *
 * @param {string} str - raw text of the media playlist
 * @returns {{duration: number, segmentPaths: string[]}} `duration` is the sum
 *   of the EXTINF durations, in the unit used by the playlist.
 */
const parsePlaylist = (str) => {
    const segmentPaths = [];
    // `[^"]*` stops at the closing quote even when more quoted attributes follow
    const initSegment = /#EXT-X-MAP:(?:[^\r\n]*?,)?URI="([^"]*)"/.exec(str);
    if (initSegment)
        segmentPaths.push(initSegment[1]);
    let duration = 0;
    // duration of the most recent EXTINF tag still waiting for its path line
    let pendingDuration = null;
    for (const rawLine of str.split(/\r?\n/)) {
        const line = rawLine.trim();
        if (!line)
            continue;
        const extinf = /^#EXTINF:([^,]*)/.exec(line);
        if (extinf) {
            pendingDuration = Number(extinf[1]);
            continue;
        }
        if (line.startsWith('#'))
            continue;
        if (pendingDuration !== null) {
            duration += pendingDuration;
            segmentPaths.push(line);
            pendingDuration = null;
        }
    }
    return { duration, segmentPaths };
};
+exports.M3u8 = { parseStreamlist, parsePlaylist };

+ 104 - 50
dist/webshot.js

@@ -26,6 +26,27 @@ const typeInZH = {
     animated_gif: ZHType('GIF'),
 };
 const logger = (0, loggers_1.getLogger)('webshot');
/**
 * Issue a GET request through axios and resolve with the response body.
 *
 * Any failure (transport error, timeout, or a status other than 200) is
 * logged exactly once and surfaces as a rejected promise whose Error message
 * names the URL and the reason.
 *
 * @param {string} url - address to fetch
 * @param {string} responseType - passed through as the axios `responseType`
 *   option (callers in this file use 'arraybuffer' and 'text')
 * @param {number} [timeout=150000] - passed through as the axios `timeout` option
 * @returns {Promise<*>} resolves with `res.data`
 */
const axiosGet = (url, responseType, timeout = 150000) => {
    logger.info(`fetching ${url}`);
    return (0, axios_1.default)({
        method: 'get',
        url,
        responseType,
        timeout,
    }).then(res => {
        // Only an exact 200 is accepted; the failure is reported by the
        // single catch below rather than logged here and again there.
        if (res.status !== 200)
            throw new Error(`unexpected status ${res.status}`);
        logger.info(`successfully fetched ${url}`);
        return res.data;
    }).catch(err => {
        const reason = err instanceof Error ? err.message : String(err);
        logger.error(`failed to fetch ${url}: ${reason}`);
        throw new Error(`failed to fetch ${url}: ${reason}`);
    });
};
 class Webshot extends CallableInstance {
     constructor(wsUrl, mode, onready) {
         super('webshot');
@@ -88,7 +109,20 @@ class Webshot extends CallableInstance {
                         width: width / zoomFactor,
                         height: height / zoomFactor,
                     })
-                        .then(() => page.route('*:\/\/video.twimg.com\/**', route => route.abort()))
+                        .then(() => page.route('*://video.twimg.com/**', route => route.abort().then(() => page.evaluate(videoUrl => {
+                        let videoUrls = window['__scrapedVideoUrls'];
+                        if (!videoUrls)
+                            videoUrls = window['__scrapedVideoUrls'] = [];
+                        if (!videoUrls.includes(videoUrl)) {
+                            videoUrls.push(videoUrl);
+                            return videoUrl;
+                        }
+                    }, route.request().url())).then(videoUrl => {
+                        if (videoUrl)
+                            logger.info(`scraped ${route.request().url()} from page`);
+                    }).catch(err => {
+                        logger.error(`error aborting request to ${route.request().url()}, error: ${err}`);
+                    })))
                         .then(gotoUrlAndWaitForTweet)
                         .then(() => page.addStyleTag({
                         content: 'header,#layers{display:none!important}article{background-color:transparent!important}' +
@@ -193,42 +227,23 @@ class Webshot extends CallableInstance {
             }).catch(error => this.reconnect(error)
                 .then(() => this.renderWebshot(url, height, webshotDelay, ...morePostProcessings)));
         };
-        this.fetchMedia = (url) => new Promise((resolve, reject) => {
-            logger.info(`fetching ${url}`);
-            (0, axios_1.default)({
-                method: 'get',
-                url,
-                responseType: 'arraybuffer',
-                timeout: 150000,
-            }).then(res => {
-                if (res.status === 200) {
-                    logger.info(`successfully fetched ${url}`);
-                    resolve(res.data);
-                }
-                else {
-                    logger.error(`failed to fetch ${url}: ${res.status}`);
-                    reject();
-                }
-            }).catch(err => {
-                logger.error(`failed to fetch ${url}: ${err instanceof Error ? err.message : err}`);
-                reject();
-            });
-        }).then(data => {
+        this.fetchMedia = (url) => (url.match(/^file:/) ? Promise.resolve(url) : axiosGet(url, 'arraybuffer').then(data => {
             var _a;
             return (ext => {
                 const mediaTempFilePath = temp.path({ suffix: `.${ext}` });
                 (0, fs_1.writeFileSync)(mediaTempFilePath, Buffer.from(data));
-                const path = `file://${mediaTempFilePath}`;
-                switch (ext) {
-                    case 'jpg':
-                    case 'png':
-                        return koishi_1.Message.Image(path);
-                    case 'mp4':
-                        return koishi_1.Message.Video(path);
-                }
-                logger.warn('unable to find MIME type of fetched media, failing this fetch');
-                throw Error();
+                return `file://${mediaTempFilePath}`;
             })(((_a = (/\?format=([a-z]+)&/.exec(url))) !== null && _a !== void 0 ? _a : (/.*\/.*\.([^?]+)/.exec(url)))[1]);
+        })).then(path => {
+            switch ((/.*\.(.*?)$/.exec(path) || [])[1]) {
+                case 'jpg':
+                case 'png':
+                    return koishi_1.Message.Image(path);
+                case 'mp4':
+                    return koishi_1.Message.Video(path);
+            }
+            logger.warn('unable to find MIME type of fetched media, failing this fetch');
+            throw Error();
         });
         if (this.mode = mode) {
             onready();
@@ -267,12 +282,12 @@ class Webshot extends CallableInstance {
             });
             if (this.mode === 0) {
                 const url = `https://mobile.twitter.com/${twi.user.screen_name}/status/${twi.id_str}`;
-                const extendEntity = (cardImg) => {
-                    var _a, _b;
-                    originTwi.extended_entities = Object.assign(Object.assign({}, originTwi.extended_entities), { media: [
-                            ...(_b = (_a = originTwi.extended_entities) === null || _a === void 0 ? void 0 : _a.media) !== null && _b !== void 0 ? _b : [],
-                            cardImg,
-                        ] });
+                const extendEntity = (cardMedia) => {
+                    var _a;
+                    return (media => {
+                        if (!media.some(entity => entity.id_str === cardMedia.id_str))
+                            media.push(cardMedia);
+                    })((_a = (originTwi.extended_entities || (originTwi.extended_entities = {}))).media || (_a.media = []));
                 };
                 const truncateLongThread = (atId) => {
                     if (!atId)
@@ -296,27 +311,66 @@ class Webshot extends CallableInstance {
                     }
                     catch (_a) { }
                     document.documentElement.scrollTop = 0;
-                }).then(truncateLongThread), (_, tweetHandle) => tweetHandle.evaluate(div => {
-                    const cardImg = div.querySelector('div[data-testid^="card.layout"][data-testid$=".media"] img');
-                    if (typeof (cardImg === null || cardImg === void 0 ? void 0 : cardImg.getAttribute('src')) === 'string') {
-                        const match = /^(.*\/card_img\/(\d+)\/.+\?format=.*)&name=/.exec(cardImg === null || cardImg === void 0 ? void 0 : cardImg.getAttribute('src'));
-                        if (match) {
-                            const [media_url_https, id_str] = match.slice(1);
-                            return {
+                }).then(truncateLongThread), (page, tweetHandle) => tweetHandle.evaluate(div => {
+                    const cardMedia = div.querySelector('div[data-testid^="card.layout"][data-testid$=".media"] img, video');
+                    let match;
+                    if ((cardMedia === null || cardMedia === void 0 ? void 0 : cardMedia.tagName) === 'IMG' && typeof (cardMedia === null || cardMedia === void 0 ? void 0 : cardMedia.getAttribute('src')) === 'string') {
+                        match = /^(.*\/card_img\/(\d+)\/.+\?format=.*)&name=/.exec(cardMedia === null || cardMedia === void 0 ? void 0 : cardMedia.getAttribute('src'));
+                    }
+                    if ((cardMedia === null || cardMedia === void 0 ? void 0 : cardMedia.tagName) === 'VIDEO' && typeof (cardMedia === null || cardMedia === void 0 ? void 0 : cardMedia.getAttribute('poster')) === 'string') {
+                        match = /^(.*\/amplify_video_thumb\/(\d+)\/img\/.*$)/.exec(cardMedia === null || cardMedia === void 0 ? void 0 : cardMedia.getAttribute('poster'));
+                    }
+                    if (match) {
+                        const [media_url_https, id_str] = match.slice(1);
+                        return {
+                            type: cardMedia.tagName,
+                            entityBase: {
                                 media_url: media_url_https.replace(/^https/, 'http'),
                                 media_url_https,
                                 url: '',
                                 display_url: '',
                                 expanded_url: '',
-                                type: 'photo',
                                 id: Number(id_str),
                                 id_str,
                                 sizes: undefined,
-                            };
-                        }
+                            }
+                        };
                     }
-                }).then(cardImg => { if (cardImg)
-                    extendEntity(cardImg); })))
+                    return {};
+                }).then(({ type, entityBase }) => {
+                    if (type === 'IMG')
+                        extendEntity(Object.assign(Object.assign({}, entityBase), { type: 'photo' }));
+                    if (type === 'VIDEO')
+                        page.evaluate(id_str => {
+                            var _a;
+                            return (_a = window['__scrapedVideoUrls']) === null || _a === void 0 ? void 0 : _a.filter(videoUrl => new RegExp(`.*/amplify_video/${id_str}.*\\.m3u8(?:\\?|$)`).exec(videoUrl));
+                        }, entityBase.id_str).then(videoUrls => {
+                            if (videoUrls && videoUrls.length) {
+                                Promise.all(videoUrls.map(streamlistUrl => axiosGet(streamlistUrl, 'text')
+                                    .then(streamlist => utils_1.M3u8.parseStreamlist(streamlist)[0])
+                                    .then(({ bandwidth, playlistPath, resolution }) => {
+                                    const [width, height] = /(.*)x(.*)/.exec(resolution).slice(1).map(Number);
+                                    const playlistUrl = new URL(playlistPath, streamlistUrl);
+                                    const mediaTempFilePath = temp.path({ suffix: `.mp4` });
+                                    return axiosGet(playlistUrl.href, 'text')
+                                        .then(playlist => utils_1.M3u8.parsePlaylist(playlist))
+                                        .then(({ duration, segmentPaths }) => (0, utils_1.chainPromises)(segmentPaths.map(path => () => axiosGet(new URL(path, playlistUrl).href, 'arraybuffer').then(data => {
+                                        (0, fs_1.writeFileSync)(mediaTempFilePath, Buffer.from(data), { flag: 'a' });
+                                    }))).then(() => ({
+                                        duration_millis: duration * 1000,
+                                        aspect_ratio: [width, height],
+                                        variants: [{
+                                                bitrate: bandwidth,
+                                                content_type: 'video/mp4',
+                                                url: `file://${mediaTempFilePath}`,
+                                            }]
+                                    })));
+                                }))).then(videoInfos => videoInfos.reduce((vi1, vi2) => (Object.assign(Object.assign({}, vi1), { variants: vi1.variants.concat(vi2.variants) })))).then(videoInfo => extendEntity(Object.assign(Object.assign({}, entityBase), { type: 'video', video_info: videoInfo }))).catch(error => {
+                                    logger.warn('unable to fetch scraped video, ignoring...');
+                                });
+                            }
+                        });
+                })))
                     .then(fileurl => {
                     if (fileurl)
                         return koishi_1.Message.Image(fileurl);

+ 24 - 0
src/utils.ts

@@ -53,3 +53,27 @@ export const BigNumOps = {
   lShift: bigNumLShift,
   parse: parseBigNum,
 };
+
/**
 * Parse an HLS master playlist ("stream list") into its variant streams.
 *
 * Each `#EXT-X-STREAM-INF:` tag line is paired with the line directly after
 * it, which is taken as the variant's playlist path. Attributes are looked up
 * by name, so their order, the presence of other attributes (including quoted
 * values that contain commas, e.g. CODECS="a,b") and CRLF line endings do not
 * affect the result.
 *
 * @param str raw text of the master playlist
 * @returns one entry per variant that declares both BANDWIDTH and RESOLUTION
 *   and is followed by a path line; variants missing any of these are skipped
 *   so that `resolution` is always a "<width>x<height>" string.
 */
const parseStreamlist = (str: string) => {
  const STREAM_INF_TAG = '#EXT-X-STREAM-INF:';
  const lines = str.split(/\r?\n/);
  const variants: {bandwidth: number, resolution: string, playlistPath: string}[] = [];
  lines.forEach((line, index) => {
    if (!line.startsWith(STREAM_INF_TAG)) return;
    const attributes = line.slice(STREAM_INF_TAG.length);
    // `(?:^|,)` keeps AVERAGE-BANDWIDTH from being mistaken for BANDWIDTH
    const bandwidth = /(?:^|,)BANDWIDTH=(\d+)/.exec(attributes);
    const resolution = /(?:^|,)RESOLUTION=(\d+x\d+)/.exec(attributes);
    const playlistPath = (lines[index + 1] || '').trim();
    // a following tag/comment line means this variant has no usable path
    if (!bandwidth || !resolution || !playlistPath || playlistPath.startsWith('#')) return;
    variants.push({
      bandwidth: Number(bandwidth[1]),
      resolution: resolution[1],
      playlistPath,
    });
  });
  return variants;
};
+
/**
 * Parse an HLS media playlist into its total duration and ordered segment
 * paths.
 *
 * If the playlist declares an initialization segment via
 * `#EXT-X-MAP:...URI="..."`, that URI is placed first in `segmentPaths`;
 * otherwise the list contains media segments only (no placeholder entry).
 * Every `#EXTINF:<duration>,[<title>]` tag is paired with the next line that
 * is neither blank nor a tag/comment. CRLF line endings are accepted.
 *
 * @param str raw text of the media playlist
 * @returns `duration` is the sum of the EXTINF durations, in the unit used by
 *   the playlist; `segmentPaths` lists the paths in playback order.
 */
const parsePlaylist = (str: string) => {
  const segmentPaths: string[] = [];
  // `[^"]*` stops at the closing quote even when more quoted attributes follow
  const initSegment = /#EXT-X-MAP:(?:[^\r\n]*?,)?URI="([^"]*)"/.exec(str);
  if (initSegment) segmentPaths.push(initSegment[1]);
  let duration = 0;
  // duration of the most recent EXTINF tag still waiting for its path line
  let pendingDuration: number | null = null;
  for (const rawLine of str.split(/\r?\n/)) {
    const line = rawLine.trim();
    if (!line) continue;
    const extinf = /^#EXTINF:([^,]*)/.exec(line);
    if (extinf) {
      pendingDuration = Number(extinf[1]);
      continue;
    }
    if (line.startsWith('#')) continue;
    if (pendingDuration !== null) {
      duration += pendingDuration;
      segmentPaths.push(line);
      pendingDuration = null;
    }
  }
  return {duration, segmentPaths};
};
+
+export const M3u8 = {parseStreamlist, parsePlaylist};

+ 121 - 52
src/webshot.ts

@@ -2,7 +2,7 @@ import { writeFileSync } from 'fs';
 import { Readable } from 'stream';
 import { promisify } from 'util';
 
-import axios from 'axios';
+import axios, { ResponseType as AxiosResponseType } from 'axios';
 import * as CallableInstance from 'callable-instance';
 import { XmlEntities } from 'html-entities';
 import { PNG } from 'pngjs';
@@ -13,7 +13,7 @@ import * as temp from 'temp';
 import { getLogger } from './loggers';
 import { Message } from './koishi';
 import { MediaEntity, Tweet } from './twitter';
-import { chainPromises } from './utils';
+import { chainPromises, M3u8 } from './utils';
 
 const xmlEntities = new XmlEntities();
 
@@ -30,6 +30,27 @@ const typeInZH = {
 
 const logger = getLogger('webshot');
 
/**
 * Issue a GET request through axios and resolve with the response body.
 *
 * Any failure (transport error, timeout, or a status other than 200) is
 * logged exactly once and surfaces as a rejected promise whose Error message
 * names the URL and the reason.
 *
 * @param url address to fetch
 * @param responseType passed through as the axios `responseType` option; it
 *   also selects the static type of the resolved value ('text' -> string,
 *   'arraybuffer' -> ArrayBuffer, anything else -> any)
 * @param timeout passed through as the axios `timeout` option
 * @returns promise resolving with `res.data`
 */
const axiosGet = <T extends AxiosResponseType>(url: string, responseType: T, timeout = 150000) => {
  logger.info(`fetching ${url}`);
  return axios({
    method: 'get',
    url,
    responseType,
    timeout,
  }).then(res => {
    // Only an exact 200 is accepted; the failure is reported by the single
    // catch below rather than logged here and again there.
    if (res.status !== 200) throw new Error(`unexpected status ${res.status}`);
    logger.info(`successfully fetched ${url}`);
    return res.data as {text: string, arraybuffer: ArrayBuffer, [k: string]: any}[T];
  }).catch(err => {
    const reason = err instanceof Error ? err.message : String(err);
    logger.error(`failed to fetch ${url}: ${reason}`);
    throw new Error(`failed to fetch ${url}: ${reason}`);
  });
};
+
 class Webshot extends CallableInstance<[Tweet[], (...args) => void, number], Promise<void>> {
 
   private browser: puppeteer.Browser;
@@ -115,7 +136,20 @@ class Webshot extends CallableInstance<[Tweet[], (...args) => void, number], Pro
             width: width / zoomFactor,
             height: height / zoomFactor,
           })
-            .then(() => page.route('*:\/\/video.twimg.com\/**', route => route.abort()))
+            .then(() => page.route('*://video.twimg.com/**', route =>
+              route.abort().then(() => page.evaluate(videoUrl => {
+                let videoUrls: string[] = window['__scrapedVideoUrls'];
+                if (!videoUrls) videoUrls = window['__scrapedVideoUrls'] = [];
+                if (!videoUrls.includes(videoUrl)) {
+                  videoUrls.push(videoUrl);
+                  return videoUrl;
+                }
+              }, route.request().url())).then(videoUrl => {
+                if (videoUrl) logger.info(`scraped ${route.request().url()} from page`);
+              }).catch(err => {
+                logger.error(`error aborting request to ${route.request().url()}, error: ${err}`);
+              })
+            ))
             .then(gotoUrlAndWaitForTweet)
             // hide header, "more options" button, like and retweet count
             .then(() => page.addStyleTag({
@@ -232,31 +266,15 @@ class Webshot extends CallableInstance<[Tweet[], (...args) => void, number], Pro
     );
   };
 
-  private fetchMedia = (url: string): Promise<string> => new Promise<ArrayBuffer>((resolve, reject) => {
-    logger.info(`fetching ${url}`);
-    axios({
-      method: 'get',
-      url,
-      responseType: 'arraybuffer',
-      timeout: 150000,
-    }).then(res => {
-      if (res.status === 200) {
-        logger.info(`successfully fetched ${url}`);
-        resolve(res.data);
-      } else {
-        logger.error(`failed to fetch ${url}: ${res.status}`);
-        reject();
-      }
-    }).catch (err => {
-      logger.error(`failed to fetch ${url}: ${err instanceof Error ? err.message : err}`);
-      reject();
-    });
-  }).then(data =>
-    (ext => {
-      const mediaTempFilePath = temp.path({suffix: `.${ext}`});
-      writeFileSync(mediaTempFilePath, Buffer.from(data));
-      const path = `file://${mediaTempFilePath}`;
-      switch (ext) {
+  private fetchMedia = (url: string) =>
+    (url.match(/^file:/) ? Promise.resolve(url) : axiosGet(url, 'arraybuffer').then(data =>
+      (ext => {
+        const mediaTempFilePath = temp.path({suffix: `.${ext}`});
+        writeFileSync(mediaTempFilePath, Buffer.from(data));
+        return `file://${mediaTempFilePath}`;
+      })(((/\?format=([a-z]+)&/.exec(url)) ?? (/.*\/.*\.([^?]+)/.exec(url)))[1])
+    )).then(path => {
+      switch ((/.*\.(.*?)$/.exec(path) || [])[1]) {
         case 'jpg':
         case 'png':
           return Message.Image(path);
@@ -265,8 +283,7 @@ class Webshot extends CallableInstance<[Tweet[], (...args) => void, number], Pro
       }
       logger.warn('unable to find MIME type of fetched media, failing this fetch');
       throw Error();
-    })(((/\?format=([a-z]+)&/.exec(url)) ?? (/.*\/.*\.([^?]+)/.exec(url)))[1])
-  );
+    });
 
   public webshot(
     tweets: Tweet[],
@@ -305,15 +322,10 @@ class Webshot extends CallableInstance<[Tweet[], (...args) => void, number], Pro
       // invoke webshot
       if (this.mode === 0) {
         const url = `https://mobile.twitter.com/${twi.user.screen_name}/status/${twi.id_str}`;
-        const extendEntity = (cardImg: MediaEntity) => {
-          originTwi.extended_entities = {
-            ...originTwi.extended_entities,
-            media: [
-              ...originTwi.extended_entities?.media ?? [],
-              cardImg,
-            ],
-          };
-        };
+        const extendEntity = (cardMedia: MediaEntity) =>
+          (media => {
+            if (!media.some(entity => entity.id_str === cardMedia.id_str)) media.push(cardMedia);
+          })((originTwi.extended_entities ||= {}).media ||= []);
         const truncateLongThread = (atId: string) => {
           if (!atId) return;
           logger.info(`thread too long, truncating at tweet ${atId}...`);
@@ -340,28 +352,85 @@ class Webshot extends CallableInstance<[Tweet[], (...args) => void, number], Pro
             document.documentElement.scrollTop = 0;
           }).then(truncateLongThread),
 
-          // scrape card image from main tweet
-          (_, tweetHandle: puppeteer.ElementHandle<HTMLDivElement>) => tweetHandle.evaluate(div => {
-            const cardImg = div.querySelector('div[data-testid^="card.layout"][data-testid$=".media"] img');
-            if (typeof cardImg?.getAttribute('src') === 'string') {
-              const match = /^(.*\/card_img\/(\d+)\/.+\?format=.*)&name=/.exec(cardImg?.getAttribute('src'));
-              if (match) {
-                // tslint:disable-next-line: variable-name
-                const [media_url_https, id_str] = match.slice(1);
-                return {
+          // scrape card media from main tweet
+          (page, tweetHandle: puppeteer.ElementHandle<HTMLDivElement>) => tweetHandle.evaluate(div => {
+            const cardMedia = div.querySelector('div[data-testid^="card.layout"][data-testid$=".media"] img, video');
+            let match: RegExpExecArray;
+            if (cardMedia?.tagName === 'IMG' && typeof cardMedia?.getAttribute('src') === 'string') {
+              match = /^(.*\/card_img\/(\d+)\/.+\?format=.*)&name=/.exec(cardMedia?.getAttribute('src'));
+            }
+            if (cardMedia?.tagName === 'VIDEO' && typeof cardMedia?.getAttribute('poster') === 'string') {
+              match = /^(.*\/amplify_video_thumb\/(\d+)\/img\/.*$)/.exec(cardMedia?.getAttribute('poster'));
+            }
+            if (match) {
+              const [media_url_https, id_str] = match.slice(1);
+              return {
+                type: cardMedia.tagName,
+                entityBase: {
                   media_url: media_url_https.replace(/^https/, 'http'),
                   media_url_https,
                   url: '',
                   display_url: '',
                   expanded_url: '',
-                  type: 'photo',
                   id: Number(id_str),
                   id_str,
                   sizes: undefined,
-                };
-              }
+                }
+              };
             }
-          }).then(cardImg => { if (cardImg) extendEntity(cardImg); })
+            return {};
+          }).then(({type, entityBase}) => {
+            if (type === 'IMG') extendEntity({
+              ...entityBase,
+              type: 'photo',
+            });
+            if (type === 'VIDEO') page.evaluate(
+              id_str => (window['__scrapedVideoUrls'] as string[])?.filter(videoUrl =>
+                new RegExp(`.*/amplify_video/${id_str}.*\\.m3u8(?:\\?|$)`).exec(videoUrl)
+              ),
+              entityBase.id_str
+            ).then(videoUrls => {
+              if (videoUrls && videoUrls.length) {
+                Promise.all(videoUrls.map(streamlistUrl =>
+                  axiosGet(streamlistUrl, 'text')
+                    .then(streamlist => M3u8.parseStreamlist(streamlist)[0])
+                    .then(({bandwidth, playlistPath, resolution}) => {
+                      const [width, height] = /(.*)x(.*)/.exec(resolution).slice(1).map(Number);
+                      const playlistUrl = new URL(playlistPath, streamlistUrl);
+                      const mediaTempFilePath = temp.path({suffix: `.mp4`});
+                      return axiosGet(playlistUrl.href, 'text')
+                        .then(playlist => M3u8.parsePlaylist(playlist))
+                        .then(({duration, segmentPaths}) =>
+                          chainPromises(segmentPaths.map(path => () =>
+                            axiosGet(new URL(path, playlistUrl).href, 'arraybuffer').then(data => {
+                              writeFileSync(mediaTempFilePath, Buffer.from(data), {flag: 'a'});
+                            })
+                          )).then(() => ({
+                            duration_millis: duration * 1000,
+                            aspect_ratio: [width, height],
+                            variants: [{
+                              bitrate: bandwidth,
+                              content_type: 'video/mp4',
+                              url: `file://${mediaTempFilePath}`,
+                            }]
+                          }) as MediaEntity['video_info'])
+                        )
+                    })
+                )).then(videoInfos =>
+                  videoInfos.reduce((vi1, vi2) => ({
+                    ...vi1,
+                    variants: vi1.variants.concat(vi2.variants)
+                  }))
+                ).then(videoInfo => extendEntity({
+                  ...entityBase,
+                  type: 'video',
+                  video_info: videoInfo,
+                })).catch(error => {
+                  logger.warn('unable to fetch scraped video, ignoring...');
+                });
+              }
+            });
+          })
         ))
           .then(fileurl => {
             if (fileurl) return Message.Image(fileurl);