
Commit 80d61e8

Merge pull request #341 from bellingcat/dev
Address several small bugs; includes TikTok photo extraction and data-saving proxy usage in the generic_extractor.
2 parents 0f56a5a + d36cdbf commit 80d61e8

File tree: 12 files changed, +298 −153 lines changed

docs/source/modules/extractor.md

Lines changed: 3 additions & 2 deletions

@@ -4,8 +4,9 @@ Extractor modules are used to extract the content of a given URL. Typically, one
 
 Extractors that are able to extract content from a wide range of websites include:
 1. Generic Extractor: parses videos and images on sites using the powerful yt-dlp library.
-2. Wayback Machine Extractor: sends pages to the Wayback machine for archiving, and stores the link.
-3. WACZ Extractor: runs a web browser to 'browse' the URL and save a copy of the page in WACZ format.
+2. Antibot Extractor: uses a headless browser to bypass bot detection and extract content.
+3. WACZ Extractor: runs a web browser to 'browse' the URL and save a copy of the page in WACZ format.
+4. Wayback Machine Extractor: sends pages to the Wayback machine for archiving, and stores the archived link.
 
 ```{include} autogen/extractor.md
 ```

poetry.lock

Lines changed: 152 additions & 127 deletions
Some generated files are not rendered by default.

pyproject.toml

Lines changed: 2 additions & 1 deletion

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [project]
 name = "auto-archiver"
-version = "1.1.1"
+version = "1.1.2"
 description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)."
 
 requires-python = ">=3.10,<3.13"
@@ -58,6 +58,7 @@ dependencies = [
     "secretstorage (>=3.3.3,<4.0.0)",
     "seleniumbase (>=4.36.4,<5.0.0)",
     "pyautogui (>=0.9.54,<0.10.0)",
+    "pyperclip (==1.8.2)",
 ]
 
 [tool.poetry.group.dev.dependencies]

src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py

Lines changed: 11 additions & 2 deletions

@@ -81,6 +81,9 @@ def _prepare_user_data_dir(self):
         os.makedirs(self.user_data_dir, exist_ok=True)
 
     def enrich(self, to_enrich: Metadata, custom_data_dir: bool = True) -> bool:
+        if to_enrich.get_media_by_id("html_source_code"):
+            logger.info("Antibot has already been executed, skipping.")
+            return True
         using_user_data_dir = self.user_data_dir if custom_data_dir else None
         url = to_enrich.get_url()
 
@@ -96,7 +99,7 @@ def enrich(self, to_enrich: Metadata, custom_data_dir: bool = True) -> bool:
             dropin = self._get_suitable_dropin(url, sb)
             dropin.open_page(url)
 
-            if self.detect_auth_wall and self._hit_auth_wall(sb):
+            if self.detect_auth_wall and (dropin.hit_auth_wall() and self._hit_auth_wall(sb)):
                 logger.warning("Skipping since auth wall or CAPTCHA was detected")
                 return False
 
@@ -274,8 +277,14 @@ def _enrich_download_media(self, sb: SB, to_enrich: Metadata, js_css_selector: s
             return
         url = to_enrich.get_url()
         all_urls = set()
+        logger.debug(f"Extracting media for {js_css_selector=}")
+
+        try:
+            sources = sb.execute_script(js_css_selector)
+        except selenium.common.exceptions.JavascriptException as e:
+            logger.error(f"Error executing JavaScript selector {js_css_selector}: {e}")
+            return
 
-        sources = sb.execute_script(js_css_selector)
         # js_for_css_selectors
         for src in sources:
            if len(all_urls) >= max_media:
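
The parenthesised `dropin.hit_auth_wall() and self._hit_auth_wall(sb)` above is a short-circuit gate: the global detector `_hit_auth_wall(sb)` only runs when the dropin answers True, so a dropin can opt its site out of auth-wall detection entirely. A runnable toy illustration of that short-circuit, using hypothetical stand-in classes rather than the module's real Dropin/SB objects:

```python
# Hypothetical stand-ins for illustration; not auto-archiver's real classes.
class DefaultDropin:
    def hit_auth_wall(self) -> bool:
        return True  # defer to the global detector


class PublicSiteDropin(DefaultDropin):
    def hit_auth_wall(self) -> bool:
        return False  # never treat this site as auth-walled


def global_auth_wall_detector() -> bool:
    print("  running global auth-wall heuristics...")
    return True  # pretend a login wall was found


detect_auth_wall = True
for dropin in (DefaultDropin(), PublicSiteDropin()):
    # mirrors: detect_auth_wall and (dropin.hit_auth_wall() and self._hit_auth_wall(sb))
    blocked = detect_auth_wall and (dropin.hit_auth_wall() and global_auth_wall_detector())
    print(f"{type(dropin).__name__}: blocked={blocked}")
```

The global detector never even executes for `PublicSiteDropin`, which is exactly how the TikTok dropin below disables the check for public posts.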

src/auto_archiver/modules/antibot_extractor_enricher/dropin.py

Lines changed: 15 additions & 2 deletions

@@ -1,3 +1,4 @@
+import json
 import os
 import traceback
 from typing import Mapping
@@ -74,8 +75,11 @@ def js_for_image_css_selectors(self) -> str:
 
         You can overwrite this instead of `images_selector` for more control over scraped images.
         """
+        if not self.images_selectors():
+            return "return [];"
+        safe_selector = json.dumps(self.images_selectors())
         return f"""
-        return Array.from(document.querySelectorAll("{self.images_selectors()}")).map(el => el.src || el.href).filter(Boolean);
+        return Array.from(document.querySelectorAll({safe_selector})).map(el => el.src || el.href).filter(Boolean);
         """
 
     def js_for_video_css_selectors(self) -> str:
@@ -84,8 +88,11 @@ def js_for_video_css_selectors(self) -> str:
 
         You can overwrite this instead of `video_selector` for more control over scraped videos.
         """
+        if not self.video_selectors():
+            return "return [];"
+        safe_selector = json.dumps(self.video_selectors())
         return f"""
-        return Array.from(document.querySelectorAll("{self.video_selectors()}")).map(el => el.src || el.href).filter(Boolean);
+        return Array.from(document.querySelectorAll({safe_selector})).map(el => el.src || el.href).filter(Boolean);
         """
 
     def open_page(self, url) -> bool:
@@ -103,6 +110,12 @@ def add_extra_media(self, to_enrich: Metadata) -> tuple[int, int]:
         """
         return 0, 0
 
+    def hit_auth_wall(self) -> bool:
+        """
+        Custom check for whether the current page is behind an authentication wall. If True is returned, the default global auth wall detector is then consulted; if False, no auth wall is assumed and the page is considered open.
+        """
+        return True
+
     def _get_username_password(self, site) -> tuple[str, str]:
         """
         Get the username and password for the site from the extractor's auth data.
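
The switch to `json.dumps` matters because CSS selectors can legally contain double quotes, which would terminate the old hand-quoted JavaScript string literal early. A minimal standalone demonstration, using the selector value the TikTok dropin below returns:

```python
import json

selector = '[data-e2e="detail-photo"] img'  # contains double quotes

# Old approach: interpolating into a hand-quoted JS string breaks, because the
# selector's own quotes end the JS string literal early:
broken = f'document.querySelectorAll("{selector}")'
print(broken)  # document.querySelectorAll("[data-e2e="detail-photo"] img")  <- invalid JS

# New approach: json.dumps emits a correctly escaped JS string literal:
safe = f'document.querySelectorAll({json.dumps(selector)})'
print(safe)  # document.querySelectorAll("[data-e2e=\"detail-photo\"] img")
```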
src/auto_archiver/modules/antibot_extractor_enricher/dropins/tiktok.py

Lines changed: 50 additions & 0 deletions

@@ -0,0 +1,50 @@
+from contextlib import suppress
+from typing import Mapping
+from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
+
+
+class TikTokDropin(Dropin):
+    """
+    A class to handle TikTok drop-in functionality for the antibot extractor enricher module.
+    """
+
+    def documentation() -> Mapping[str, str]:
+        return {
+            "name": "TikTok Dropin",
+            "description": "Handles TikTok posts and works without authentication.",
+            "site": "tiktok.com",
+        }
+
+    @staticmethod
+    def suitable(url: str) -> bool:
+        return "tiktok.com" in url
+
+    @staticmethod
+    def images_selectors() -> str:
+        return '[data-e2e="detail-photo"] img'
+
+    @staticmethod
+    def video_selectors() -> str:
+        return None  # TikTok videos should be handled by the generic extractor
+
+    def open_page(self, url) -> bool:
+        self.sb.wait_for_ready_state_complete()
+        self._close_cookies_banner()
+        # TODO: implement login logic
+        if url != self.sb.get_current_url():
+            return False
+        return True
+
+    def hit_auth_wall(self) -> bool:
+        return False  # TikTok does not require authentication for public posts
+
+    def _close_cookies_banner(self):
+        with suppress(Exception):  # selenium.common.exceptions.JavascriptException
+            self.sb.execute_script("""
+                document
+                    .querySelector("tiktok-cookie-banner")
+                    .shadowRoot.querySelector("faceplate-dialog")
+                    .querySelector("button")
+                    .click()
+            """)
+        self.sb.click_if_visible("Skip")

src/auto_archiver/modules/generic_extractor/__manifest__.py

Lines changed: 4 additions & 0 deletions

@@ -60,6 +60,10 @@
             "default": "",
             "help": "http/https/socks proxy to use for the webdriver, eg https://proxy-user:password@proxy-ip:port",
         },
+        "proxy_on_failure_only": {
+            "default": True,
+            "help": "Applies only if a proxy is set. In that case if this setting is True, the extractor will only use the proxy if the initial request fails; if it is False, the extractor will always use the proxy.",
+        },
         "end_means_success": {
             "default": True,
             "help": "if True, any archived content will mean a 'success', if False this extractor will not return a 'success' stage; this is useful for cases when the yt-dlp will archive a video but ignore other types of content like images or text only pages that the subsequent extractors can retrieve.",

src/auto_archiver/modules/generic_extractor/generic_extractor.py

Lines changed: 16 additions & 2 deletions

@@ -502,6 +502,9 @@ def _helper_for_successful_extract_info(data, info_extractor, url, ydl):
        try:
            result = self.get_metadata_for_post(info_extractor, url, ydl)
        except (yt_dlp.utils.DownloadError, yt_dlp.utils.ExtractorError) as post_e:
+            if "NSFW tweet requires authentication." in str(post_e):
+                logger.warning(str(post_e))
+                return False
            logger.error("Error downloading metadata for post: {error}", error=str(post_e))
            return False
        except Exception as generic_e:
@@ -525,13 +528,24 @@ def _helper_for_successful_extract_info(data, info_extractor, url, ydl):
 
        return result
 
-    def download(self, item: Metadata) -> Metadata:
+    def download(self, item: Metadata, skip_proxy: bool = False) -> Metadata:
        url = item.get_url()
 
        # TODO: this is a temporary hack until this issue is closed: https://github.com/yt-dlp/yt-dlp/issues/11025
        if url.startswith("https://ya.ru"):
            url = url.replace("https://ya.ru", "https://yandex.ru")
            item.set("replaced_url", url)
+        logger.debug(f"{skip_proxy=}, {self.proxy_on_failure_only=}, {self.proxy=}")
+
+        # proxy_on_failure_only logic
+        if self.proxy and self.proxy_on_failure_only and not skip_proxy:
+            # when proxy_on_failure_only is True, we first try to download without a proxy and only continue with execution if that fails
+            try:
+                if without_proxy := self.download(item, skip_proxy=True):
+                    logger.info("Downloaded successfully without proxy.")
+                    return without_proxy
+            except Exception:
+                logger.debug("Download without proxy failed, trying with proxy...")
 
        ydl_options = [
            "-o",
@@ -546,7 +560,7 @@ def download(self, item: Metadata) -> Metadata:
        ]
 
        # proxy handling
-        if self.proxy:
+        if self.proxy and not skip_proxy:
            ydl_options.extend(["--proxy", self.proxy])
 
        # max_downloads handling
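
The change above implements a fall-back pattern: with `proxy_on_failure_only` enabled, `download` first re-invokes itself with `skip_proxy=True` (so yt-dlp gets no `--proxy` flag), and only proceeds to a proxied attempt if the direct one raises or returns nothing. A self-contained sketch of the same pattern, with a hypothetical `fetch` callable standing in for the actual yt-dlp invocation:

```python
from typing import Callable, Optional


def download_with_proxy_fallback(
    fetch: Callable[[Optional[str]], Optional[bytes]],
    proxy: Optional[str],
    proxy_on_failure_only: bool = True,
) -> Optional[bytes]:
    """fetch is a hypothetical callable taking a proxy URL (or None for a
    direct request) and returning the payload; it raises or returns None on
    failure. For illustration only."""
    if not proxy:
        return fetch(None)
    if proxy_on_failure_only:
        try:
            # First attempt without the proxy: saves proxy bandwidth/credits.
            if result := fetch(None):
                return result
        except Exception:
            pass  # fall through to the proxied attempt
    return fetch(proxy)
```

Routing through the proxy only on failure is the "data-saving" mentioned in the commit message: metered proxy traffic is only consumed for URLs that actually need it.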

src/auto_archiver/modules/gsheet_feeder_db/gsheet_feeder_db.py

Lines changed: 25 additions & 14 deletions

@@ -32,26 +32,37 @@ def setup(self) -> None:
        if not self.sheet and not self.sheet_id:
            raise ValueError("You need to define either a 'sheet' name or a 'sheet_id' in your manifest.")
 
-    def open_sheet(self):
+    @retry(
+        wait_exponential_multiplier=1,
+        stop_max_attempt_number=6,
+    )
+    def open_sheet(self) -> gspread.Spreadsheet:
        if self.sheet:
            return self.gsheets_client.open(self.sheet)
        else:
            return self.gsheets_client.open_by_key(self.sheet_id)
 
+    @retry(
+        wait_exponential_multiplier=1,
+        stop_max_attempt_number=6,
+    )
+    def enumerate_sheets(self, sheet) -> Iterator[gspread.Worksheet]:
+        for worksheet in sheet.worksheets():
+            yield worksheet
+
    def __iter__(self) -> Iterator[Metadata]:
-        sh = self.open_sheet()
-        for ii, worksheet in enumerate(sh.worksheets()):
-            if not self.should_process_sheet(worksheet.title):
-                logger.debug(f"Skipped worksheet '{worksheet.title}' due to allow/block rules")
-                continue
-            logger.info(f"Opening worksheet {ii=}: {worksheet.title=} header={self.header}")
-            gw = GWorksheet(worksheet, header_row=self.header, columns=self.columns)
-            if len(missing_cols := self.missing_required_columns(gw)):
-                logger.debug(
-                    f"Skipped worksheet '{worksheet.title}' due to missing required column(s) for {missing_cols}"
-                )
-                continue
-            with logger.contextualize(worksheet=f"{sh.title}:{worksheet.title}"):
+        spreadsheet = self.open_sheet()
+        for worksheet in self.enumerate_sheets(spreadsheet):
+            with logger.contextualize(worksheet=f"{spreadsheet.title}:{worksheet.title}"):
+                if not self.should_process_sheet(worksheet.title):
+                    logger.debug("Skipped worksheet due to allow/block rules")
+                    continue
+                logger.info(f"Opening worksheet header={self.header}")
+                gw = GWorksheet(worksheet, header_row=self.header, columns=self.columns)
+                if len(missing_cols := self.missing_required_columns(gw)):
+                    logger.debug(f"Skipped worksheet due to missing required column(s) for {missing_cols}")
+                    continue
+
                # process and yield metadata here:
                yield from self._process_rows(gw)
                logger.info(f"Finished worksheet {worksheet.title}")

src/auto_archiver/modules/timestamping_enricher/__manifest__.py

Lines changed: 1 addition & 1 deletion

@@ -20,7 +20,7 @@
            # "http://tsa.sinpe.fi.cr/tsaHttp/", # self-signed
            # "http://tsa.cra.ge/signserver/tsa?workerName=qtsa", # self-signed
            "http://tss.cnbs.gob.hn/TSS/HttpTspServer",
-            "http://dss.nowina.lu/pki-factory/tsa/good-tsa",
+            # "http://dss.nowina.lu/pki-factory/tsa/good-tsa",
            # "https://freetsa.org/tsr", # self-signed
        ],
        "help": "List of RFC3161 Time Stamp Authorities to use, separate with commas if passed via the command line.",
