Browse Source

limit captured preceding replies to 1

capture at least 1 tweet by owner, if any

auto uncollapse sensitive tweets

really fix scrolling to top of thread

handle exceptions if a tweet is hidden/deleted

fix for layout change on 2021-07-28

avoid clicking invisible buttons

scrape card image from main tweet only

fix displaying wrong time unit in log
Mike L 3 years ago
parent
commit
91722d21df
3 changed files with 60 additions and 19 deletions
  1. 28 9
      dist/webshot.js
  2. 0 0
      dist/webshot.js.map
  3. 32 10
      src/webshot.ts

+ 28 - 9
dist/webshot.js

@@ -80,7 +80,7 @@ class Webshot extends CallableInstance {
                     })
                         .then(() => page.goto(url, { waitUntil: 'load', timeout: getTimeout() }))
                         .then(() => page.addStyleTag({
-                        content: 'header,#layers{display:none!important}' +
+                        content: 'header,#layers{display:none!important}article{background-color:transparent!important}' +
                             '[data-testid="caret"],[role="group"],[data-testid="tweet"]+*>[class*=" "]+div:nth-last-child(2){display:none}',
                     }))
                         .then(() => page.addStyleTag({
@@ -96,19 +96,41 @@ class Webshot extends CallableInstance {
                             });
                         }, 250);
                     }))
-                        .then(() => page.waitForSelector('article', { timeout: getTimeout() }))
+                        .then(() => page.waitForSelector('xpath=//section/*/*/div[.//article[not(.//time)]]', { timeout: getTimeout() }))
+                        .then(handle => handle.$$('xpath=..//a[contains(@href,"content_you_see")]/../../..//*[@role="button"]')
+                        .then(sensitiveToggles => {
+                        const count = sensitiveToggles.length;
+                        if (count)
+                            logger.info(`found ${count} sensitive ${count === 1 ? 'tweet' : 'tweets'} on page, uncollapsing...`);
+                        return utils_1.chainPromises(sensitiveToggles.filter(toggle => toggle.isVisible()).map(toggle => () => toggle.click()));
+                    })
+                        .then(() => handle))
                         .catch((err) => {
                         if (err.name !== 'TimeoutError')
                             throw err;
-                        logger.warn(`navigation timed out at ${getTimerTime()} seconds`);
+                        logger.warn(`navigation timed out at ${getTimerTime()} ms`);
                         return null;
                     })
-                        .then(handle => {
+                        .then((handle) => {
                         if (handle === null)
                             throw new puppeteer.errors.TimeoutError();
+                        return handle.evaluate(div => {
+                            try {
+                                const selector = '[data-testid="tweet"]>:nth-child(2)>:first-child a';
+                                const getProfileUrl = () => (div.querySelector(selector) || { href: '' }).href;
+                                const ownerProfileUrl = getProfileUrl();
+                                while (div = div.previousElementSibling) {
+                                    if (getProfileUrl() !== ownerProfileUrl)
+                                        continue;
+                                    return document.documentElement.scrollTop = window.scrollY + div.getBoundingClientRect().top;
+                                }
+                            }
+                            catch (_a) { }
+                            document.documentElement.scrollTop = 0;
+                        }).then(() => handle);
                     })
-                        .then(() => page.evaluate(() => {
-                        const cardImg = document.querySelector('div[data-testid^="card.layout"][data-testid$=".media"] img');
+                        .then(handle => handle.evaluate(div => {
+                        const cardImg = div.querySelector('div[data-testid^="card.layout"][data-testid$=".media"] img');
                         if (typeof (cardImg === null || cardImg === void 0 ? void 0 : cardImg.getAttribute('src')) === 'string') {
                             const match = /^(.*\/card_img\/(\d+)\/.+\?format=.*)&name=/.exec(cardImg === null || cardImg === void 0 ? void 0 : cardImg.getAttribute('src'));
                             if (match) {
@@ -131,9 +153,6 @@ class Webshot extends CallableInstance {
                         if (cardImg)
                             this.extendEntity(cardImg);
                     })
-                        .then(() => page.addScriptTag({
-                        content: 'document.documentElement.scrollTop=0;',
-                    }))
                         .then(() => utils_1.chainPromises(morePostProcessings.map(func => () => func(page))))
                         .then(() => util_1.promisify(setTimeout)(getTimeout()))
                         .then(() => page.screenshot())

File diff suppressed because it is too large
+ 0 - 0
dist/webshot.js.map


+ 32 - 10
src/webshot.ts

@@ -106,7 +106,7 @@ class Webshot extends CallableInstance<[Tweets, (...args) => void, number], Prom
             .then(() => page.goto(url, {waitUntil: 'load', timeout: getTimeout()}))
             // hide header, "more options" button, like and retweet count
             .then(() => page.addStyleTag({
-              content: 'header,#layers{display:none!important}' +
+              content: 'header,#layers{display:none!important}article{background-color:transparent!important}' +
                 '[data-testid="caret"],[role="group"],[data-testid="tweet"]+*>[class*=" "]+div:nth-last-child(2){display:none}',
             }))
             .then(() => page.addStyleTag({
@@ -123,17 +123,42 @@ class Webshot extends CallableInstance<[Tweets, (...args) => void, number], Prom
                 });
               }, 250);
             }))
-            .then(() => page.waitForSelector('article', {timeout: getTimeout()}))
-            .catch((err: Error): Promise<puppeteer.ElementHandle<Element> | null> => {
+            // find main tweet
+            .then(() => page.waitForSelector('xpath=//section/*/*/div[.//article[not(.//time)]]', {timeout: getTimeout()}))
+            // toggle visibility of sensitive tweets
+            .then(handle => handle.$$('xpath=..//a[contains(@href,"content_you_see")]/../../..//*[@role="button"]')
+              .then(sensitiveToggles => {
+                const count = sensitiveToggles.length;
+                if (count) logger.info(`found ${count} sensitive ${count === 1 ? 'tweet' : 'tweets'} on page, uncollapsing...`);
+                return chainPromises(sensitiveToggles.filter(toggle => toggle.isVisible()).map(toggle => () => toggle.click()));
+              })
+              .then(() => handle)
+            )
+            .catch((err: Error): Promise<puppeteer.ElementHandle<HTMLDivElement> | null> => {
               if (err.name !== 'TimeoutError') throw err;
-              logger.warn(`navigation timed out at ${getTimerTime()} seconds`);
+              logger.warn(`navigation timed out at ${getTimerTime()} ms`);
               return null;
             })
-            .then(handle => {
+            // scroll to last tweet by owner in thread, if any, or top of thread
+            .then((handle: puppeteer.ElementHandle<HTMLDivElement>) => {
               if (handle === null) throw new puppeteer.errors.TimeoutError();
+              return handle.evaluate(div => {
+                try {
+                  const selector = '[data-testid="tweet"]>:nth-child(2)>:first-child a';
+                  const getProfileUrl = () => (div.querySelector<HTMLAnchorElement>(selector) || {href: ''}).href;
+                  const ownerProfileUrl = getProfileUrl();
+                  // eslint-disable-next-line no-cond-assign
+                  while (div = div.previousElementSibling as HTMLDivElement) {
+                    if (getProfileUrl() !== ownerProfileUrl) continue;
+                    return document.documentElement.scrollTop = window.scrollY + div.getBoundingClientRect().top;
+                  }
+                } catch {/* handle errors like none-found cases */}
+                document.documentElement.scrollTop = 0;
+              }).then(() => handle);
             })
-            .then(() => page.evaluate(() => {
-              const cardImg = document.querySelector('div[data-testid^="card.layout"][data-testid$=".media"] img');
+            // scrape card image from main tweet
+            .then(handle => handle.evaluate(div => {
+              const cardImg = div.querySelector('div[data-testid^="card.layout"][data-testid$=".media"] img');
               if (typeof cardImg?.getAttribute('src') === 'string') {
                 const match = /^(.*\/card_img\/(\d+)\/.+\?format=.*)&name=/.exec(cardImg?.getAttribute('src'));
                 if (match) {
@@ -156,9 +181,6 @@ class Webshot extends CallableInstance<[Tweets, (...args) => void, number], Prom
             .then(cardImg => {
               if (cardImg) this.extendEntity(cardImg); 
             })
-            .then(() => page.addScriptTag({
-              content: 'document.documentElement.scrollTop=0;',
-            }))
             .then(() => chainPromises(morePostProcessings.map(func => () => func(page))))
             .then(() => promisify(setTimeout)(getTimeout()))
             .then(() => page.screenshot())

Some files were not shown because too many files changed in this diff