
Commit 80d61e8

Merge pull request #341 from bellingcat/dev
Address several small bugs; includes TikTok photo extraction and data-saving proxy usage in the generic_extractor.
2 parents 0f56a5a + d36cdbf commit 80d61e8

File tree: 12 files changed, +298 −153 lines changed

docs/source/modules/extractor.md

Lines changed: 3 additions & 2 deletions

@@ -4,8 +4,9 @@ Extractor modules are used to extract the content of a given URL. Typically, one
 
 Extractors that are able to extract content from a wide range of websites include:
 1. Generic Extractor: parses videos and images on sites using the powerful yt-dlp library.
-2. Wayback Machine Extractor: sends pages to the Wayback machine for archiving, and stores the link.
-3. WACZ Extractor: runs a web browser to 'browse' the URL and save a copy of the page in WACZ format.
+2. Antibot Extractor: uses a headless browser to bypass bot detection and extract content.
+3. WACZ Extractor: runs a web browser to 'browse' the URL and save a copy of the page in WACZ format.
+4. Wayback Machine Extractor: sends pages to the Wayback machine for archiving, and stores the archived link.
 
 ```{include} autogen/extractor.md
 ```

poetry.lock

Lines changed: 152 additions & 127 deletions
Some generated files are not rendered by default.

pyproject.toml

Lines changed: 2 additions & 1 deletion

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [project]
 name = "auto-archiver"
-version = "1.1.1"
+version = "1.1.2"
 description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)."
 
 requires-python = ">=3.10,<3.13"
@@ -58,6 +58,7 @@ dependencies = [
     "secretstorage (>=3.3.3,<4.0.0)",
     "seleniumbase (>=4.36.4,<5.0.0)",
     "pyautogui (>=0.9.54,<0.10.0)",
+    "pyperclip (==1.8.2)",
 ]
 
 [tool.poetry.group.dev.dependencies]

src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py

Lines changed: 11 additions & 2 deletions

@@ -81,6 +81,9 @@ def _prepare_user_data_dir(self):
         os.makedirs(self.user_data_dir, exist_ok=True)
 
     def enrich(self, to_enrich: Metadata, custom_data_dir: bool = True) -> bool:
+        if to_enrich.get_media_by_id("html_source_code"):
+            logger.info("Antibot has already been executed, skipping.")
+            return True
         using_user_data_dir = self.user_data_dir if custom_data_dir else None
         url = to_enrich.get_url()
 
@@ -96,7 +99,7 @@ def enrich(self, to_enrich: Metadata, custom_data_dir: bool = True) -> bool:
             dropin = self._get_suitable_dropin(url, sb)
             dropin.open_page(url)
 
-            if self.detect_auth_wall and self._hit_auth_wall(sb):
+            if self.detect_auth_wall and (dropin.hit_auth_wall() and self._hit_auth_wall(sb)):
                 logger.warning("Skipping since auth wall or CAPTCHA was detected")
                 return False
 
@@ -274,8 +277,14 @@ def _enrich_download_media(self, sb: SB, to_enrich: Metadata, js_css_selector: s
             return
         url = to_enrich.get_url()
         all_urls = set()
+        logger.debug(f"Extracting media for {js_css_selector=}")
+
+        try:
+            sources = sb.execute_script(js_css_selector)
+        except selenium.common.exceptions.JavascriptException as e:
+            logger.error(f"Error executing JavaScript selector {js_css_selector}: {e}")
+            return
 
-        sources = sb.execute_script(js_css_selector)
         # js_for_css_selectors
         for src in sources:
            if len(all_urls) >= max_media:
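
The parenthesised `dropin.hit_auth_wall() and self._hit_auth_wall(sb)` above is a short-circuit gate: the global detector `_hit_auth_wall(sb)` only runs when the dropin answers True, so a dropin can opt its site out of auth-wall detection entirely. A runnable toy illustration of that short-circuit, using hypothetical stand-in classes rather than the module's real Dropin/SB objects:

```python
# Hypothetical stand-ins for illustration; not auto-archiver's real classes.
class DefaultDropin:
    def hit_auth_wall(self) -> bool:
        return True  # defer to the global detector


class PublicSiteDropin(DefaultDropin):
    def hit_auth_wall(self) -> bool:
        return False  # never treat this site as auth-walled


def global_auth_wall_detector() -> bool:
    print("  running global auth-wall heuristics...")
    return True  # pretend a login wall was found


detect_auth_wall = True
for dropin in (DefaultDropin(), PublicSiteDropin()):
    # mirrors: detect_auth_wall and (dropin.hit_auth_wall() and self._hit_auth_wall(sb))
    blocked = detect_auth_wall and (dropin.hit_auth_wall() and global_auth_wall_detector())
    print(f"{type(dropin).__name__}: blocked={blocked}")
```

The global detector never even executes for `PublicSiteDropin`, which is exactly how the TikTok dropin below disables the check for public posts.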

src/auto_archiver/modules/antibot_extractor_enricher/dropin.py

Lines changed: 15 additions & 2 deletions

@@ -1,3 +1,4 @@
+import json
 import os
 import traceback
 from typing import Mapping
@@ -74,8 +75,11 @@ def js_for_image_css_selectors(self) -> str:
 
         You can overwrite this instead of `images_selector` for more control over scraped images.
         """
+        if not self.images_selectors():
+            return "return [];"
+        safe_selector = json.dumps(self.images_selectors())
         return f"""
-        return Array.from(document.querySelectorAll("{self.images_selectors()}")).map(el => el.src || el.href).filter(Boolean);
+        return Array.from(document.querySelectorAll({safe_selector})).map(el => el.src || el.href).filter(Boolean);
         """
 
     def js_for_video_css_selectors(self) -> str:
@@ -84,8 +88,11 @@ def js_for_video_css_selectors(self) -> str:
 
         You can overwrite this instead of `video_selector` for more control over scraped videos.
         """
+        if not self.video_selectors():
+            return "return [];"
+        safe_selector = json.dumps(self.video_selectors())
         return f"""
-        return Array.from(document.querySelectorAll("{self.video_selectors()}")).map(el => el.src || el.href).filter(Boolean);
+        return Array.from(document.querySelectorAll({safe_selector})).map(el => el.src || el.href).filter(Boolean);
         """
 
     def open_page(self, url) -> bool:
@@ -103,6 +110,12 @@ def add_extra_media(self, to_enrich: Metadata) -> tuple[int, int]:
         """
         return 0, 0
 
+    def hit_auth_wall(self) -> bool:
+        """
+        Custom check for whether the current page is behind an authentication wall. If True is returned, the default global auth wall detector is then consulted; if False, no auth wall is assumed and the page is considered open.
+        """
+        return True
+
     def _get_username_password(self, site) -> tuple[str, str]:
         """
         Get the username and password for the site from the extractor's auth data.
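
The switch to `json.dumps` matters because CSS selectors can legally contain double quotes, which would terminate the old hand-quoted JavaScript string literal early. A minimal standalone demonstration, using the selector value the TikTok dropin below returns:

```python
import json

selector = '[data-e2e="detail-photo"] img'  # contains double quotes

# Old approach: interpolating into a hand-quoted JS string breaks, because the
# selector's own quotes end the JS string literal early:
broken = f'document.querySelectorAll("{selector}")'
print(broken)  # document.querySelectorAll("[data-e2e="detail-photo"] img")  <- invalid JS

# New approach: json.dumps emits a correctly escaped JS string literal:
safe = f'document.querySelectorAll({json.dumps(selector)})'
print(safe)  # document.querySelectorAll("[data-e2e=\"detail-photo\"] img")
```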
src/auto_archiver/modules/antibot_extractor_enricher/dropins/tiktok.py

Lines changed: 50 additions & 0 deletions

@@ -0,0 +1,50 @@
+from contextlib import suppress
+from typing import Mapping
+from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
+
+
+class TikTokDropin(Dropin):
+    """
+    A class to handle TikTok drop-in functionality for the antibot extractor enricher module.
+    """
+
+    def documentation() -> Mapping[str, str]:
+        return {
+            "name": "TikTok Dropin",
+            "description": "Handles TikTok posts and works without authentication.",
+            "site": "tiktok.com",
+        }
+
+    @staticmethod
+    def suitable(url: str) -> bool:
+        return "tiktok.com" in url
+
+    @staticmethod
+    def images_selectors() -> str:
+        return '[data-e2e="detail-photo"] img'
+
+    @staticmethod
+    def video_selectors() -> str:
+        return None  # TikTok videos should be handled by the generic extractor
+
+    def open_page(self, url) -> bool:
+        self.sb.wait_for_ready_state_complete()
+        self._close_cookies_banner()
+        # TODO: implement login logic
+        if url != self.sb.get_current_url():
+            return False
+        return True
+
+    def hit_auth_wall(self) -> bool:
+        return False  # TikTok does not require authentication for public posts
+
+    def _close_cookies_banner(self):
+        with suppress(Exception):  # selenium.common.exceptions.JavascriptException
+            self.sb.execute_script("""
+                document
+                    .querySelector("tiktok-cookie-banner")
+                    .shadowRoot.querySelector("faceplate-dialog")
+                    .querySelector("button")
+                    .click()
+            """)
+        self.sb.click_if_visible("Skip")

src/auto_archiver/modules/generic_extractor/__manifest__.py

Lines changed: 4 additions & 0 deletions

@@ -60,6 +60,10 @@
             "default": "",
             "help": "http/https/socks proxy to use for the webdriver, eg https://proxy-user:password@proxy-ip:port",
         },
+        "proxy_on_failure_only": {
+            "default": True,
+            "help": "Applies only if a proxy is set. In that case if this setting is True, the extractor will only use the proxy if the initial request fails; if it is False, the extractor will always use the proxy.",
+        },
         "end_means_success": {
             "default": True,
             "help": "if True, any archived content will mean a 'success', if False this extractor will not return a 'success' stage; this is useful for cases when the yt-dlp will archive a video but ignore other types of content like images or text only pages that the subsequent extractors can retrieve.",

src/auto_archiver/modules/generic_extractor/generic_extractor.py

Lines changed: 16 additions & 2 deletions

@@ -502,6 +502,9 @@ def _helper_for_successful_extract_info(data, info_extractor, url, ydl):
        try:
            result = self.get_metadata_for_post(info_extractor, url, ydl)
        except (yt_dlp.utils.DownloadError, yt_dlp.utils.ExtractorError) as post_e:
+            if "NSFW tweet requires authentication." in str(post_e):
+                logger.warning(str(post_e))
+                return False
            logger.error("Error downloading metadata for post: {error}", error=str(post_e))
            return False
        except Exception as generic_e:
@@ -525,13 +528,24 @@ def _helper_for_successful_extract_info(data, info_extractor, url, ydl):
 
        return result
 
-    def download(self, item: Metadata) -> Metadata:
+    def download(self, item: Metadata, skip_proxy: bool = False) -> Metadata:
        url = item.get_url()
 
        # TODO: this is a temporary hack until this issue is closed: https://github.com/yt-dlp/yt-dlp/issues/11025
        if url.startswith("https://ya.ru"):
            url = url.replace("https://ya.ru", "https://yandex.ru")
            item.set("replaced_url", url)
+        logger.debug(f"{skip_proxy=}, {self.proxy_on_failure_only=}, {self.proxy=}")
+
+        # proxy_on_failure_only logic
+        if self.proxy and self.proxy_on_failure_only and not skip_proxy:
+            # when proxy_on_failure_only is True, we first try to download without a proxy and only continue with execution if that fails
+            try:
+                if without_proxy := self.download(item, skip_proxy=True):
+                    logger.info("Downloaded successfully without proxy.")
+                    return without_proxy
+            except Exception:
+                logger.debug("Download without proxy failed, trying with proxy...")
 
        ydl_options = [
            "-o",
@@ -546,7 +560,7 @@ def download(self, item: Metadata) -> Metadata:
        ]
 
        # proxy handling
-        if self.proxy:
+        if self.proxy and not skip_proxy:
            ydl_options.extend(["--proxy", self.proxy])
 
        # max_downloads handling
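
The change above implements a fall-back pattern: with `proxy_on_failure_only` enabled, `download` first re-invokes itself with `skip_proxy=True` (so yt-dlp gets no `--proxy` flag), and only proceeds to a proxied attempt if the direct one raises or returns nothing. A self-contained sketch of the same pattern, with a hypothetical `fetch` callable standing in for the actual yt-dlp invocation:

```python
from typing import Callable, Optional


def download_with_proxy_fallback(
    fetch: Callable[[Optional[str]], Optional[bytes]],
    proxy: Optional[str],
    proxy_on_failure_only: bool = True,
) -> Optional[bytes]:
    """fetch is a hypothetical callable taking a proxy URL (or None for a
    direct request) and returning the payload; it raises or returns None on
    failure. For illustration only."""
    if not proxy:
        return fetch(None)
    if proxy_on_failure_only:
        try:
            # First attempt without the proxy: saves proxy bandwidth/credits.
            if result := fetch(None):
                return result
        except Exception:
            pass  # fall through to the proxied attempt
    return fetch(proxy)
```

Routing through the proxy only on failure is the "data-saving" mentioned in the commit message: metered proxy traffic is only consumed for URLs that actually need it.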

src/auto_archiver/modules/gsheet_feeder_db/gsheet_feeder_db.py

Lines changed: 25 additions & 14 deletions

@@ -32,26 +32,37 @@ def setup(self) -> None:
        if not self.sheet and not self.sheet_id:
            raise ValueError("You need to define either a 'sheet' name or a 'sheet_id' in your manifest.")
 
-    def open_sheet(self):
+    @retry(
+        wait_exponential_multiplier=1,
+        stop_max_attempt_number=6,
+    )
+    def open_sheet(self) -> gspread.Spreadsheet:
        if self.sheet:
            return self.gsheets_client.open(self.sheet)
        else:
            return self.gsheets_client.open_by_key(self.sheet_id)
 
+    @retry(
+        wait_exponential_multiplier=1,
+        stop_max_attempt_number=6,
+    )
+    def enumerate_sheets(self, sheet) -> Iterator[gspread.Worksheet]:
+        for worksheet in sheet.worksheets():
+            yield worksheet
+
    def __iter__(self) -> Iterator[Metadata]:
-        sh = self.open_sheet()
-        for ii, worksheet in enumerate(sh.worksheets()):
-            if not self.should_process_sheet(worksheet.title):
-                logger.debug(f"Skipped worksheet '{worksheet.title}' due to allow/block rules")
-                continue
-            logger.info(f"Opening worksheet {ii=}: {worksheet.title=} header={self.header}")
-            gw = GWorksheet(worksheet, header_row=self.header, columns=self.columns)
-            if len(missing_cols := self.missing_required_columns(gw)):
-                logger.debug(
-                    f"Skipped worksheet '{worksheet.title}' due to missing required column(s) for {missing_cols}"
-                )
-                continue
-            with logger.contextualize(worksheet=f"{sh.title}:{worksheet.title}"):
+        spreadsheet = self.open_sheet()
+        for worksheet in self.enumerate_sheets(spreadsheet):
+            with logger.contextualize(worksheet=f"{spreadsheet.title}:{worksheet.title}"):
+                if not self.should_process_sheet(worksheet.title):
+                    logger.debug("Skipped worksheet due to allow/block rules")
+                    continue
+                logger.info(f"Opening worksheet header={self.header}")
+                gw = GWorksheet(worksheet, header_row=self.header, columns=self.columns)
+                if len(missing_cols := self.missing_required_columns(gw)):
+                    logger.debug(f"Skipped worksheet due to missing required column(s) for {missing_cols}")
+                    continue
+
                # process and yield metadata here:
                yield from self._process_rows(gw)
                logger.info(f"Finished worksheet {worksheet.title}")

src/auto_archiver/modules/timestamping_enricher/__manifest__.py

Lines changed: 1 addition & 1 deletion

@@ -20,7 +20,7 @@
            # "http://tsa.sinpe.fi.cr/tsaHttp/", # self-signed
            # "http://tsa.cra.ge/signserver/tsa?workerName=qtsa", # self-signed
            "http://tss.cnbs.gob.hn/TSS/HttpTspServer",
-            "http://dss.nowina.lu/pki-factory/tsa/good-tsa",
+            # "http://dss.nowina.lu/pki-factory/tsa/good-tsa",
            # "https://freetsa.org/tsr", # self-signed
        ],
        "help": "List of RFC3161 Time Stamp Authorities to use, separate with commas if passed via the command line.",
