Skip to content

Commit 56526a9

Browse files
authored
Merge pull request #365 from bellingcat/dev
Facebook reels fix
2 parents a9a0bae + 3a22cc2 commit 56526a9

File tree

5 files changed

+31 -17 lines changed

5 files changed

+31 -17 lines changed

poetry.lock

Lines changed: 3 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
44

55
[project]
66
name = "auto-archiver"
7-
version = "1.1.5"
7+
version = "1.1.6"
88
description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)."
99

1010
requires-python = ">=3.10,<3.13"

src/auto_archiver/modules/generic_extractor/generic_extractor.py

Lines changed: 9 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44
import os
55
import importlib
66
import subprocess
7+
import traceback
78
import zipfile
89

910
from typing import Generator, Type
@@ -305,7 +306,7 @@ def add_metadata(self, video_data: dict, info_extractor: InfoExtractor, url: str
305306
result.set_url(url)
306307

307308
if "description" in video_data and not result.get("content"):
308-
result.set_content(video_data.get("description"))
309+
result.set_content(video_data.pop("description"))
309310
# extract comments if enabled
310311
if self.comments and video_data.get("comments", None) is not None:
311312
result.set(
@@ -406,9 +407,9 @@ def get_metadata_for_video(
406407
logger.error(f"Error loading subtitle file {val.get('filepath')}: {e}")
407408
result.add_media(new_media)
408409
except Exception as e:
409-
logger.error(f"Error processing entry {entry}: {e}")
410+
logger.error(f"Error processing entry {str(entry)[:256]}: {e} {traceback.format_exc()}")
410411
if not len(result.media):
411-
logger.info(f"No media found for entry {entry}, skipping.")
412+
logger.info(f"No media found for entry {str(entry)[:256]}, skipping.")
412413
return False
413414

414415
return self.add_metadata(data, info_extractor, url, result)
@@ -604,9 +605,9 @@ def download(self, item: Metadata, skip_proxy: bool = False) -> Metadata:
604605
validated_options
605606
) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en"
606607

608+
result: Metadata = None
607609
for info_extractor in self.suitable_extractors(url):
608-
result = self.download_for_extractor(info_extractor, url, ydl)
609-
if result:
610-
return result
611-
612-
return False
610+
local_result: Metadata = self.download_for_extractor(info_extractor, url, ydl)
611+
if local_result:
612+
result = result.merge(local_result) if result else local_result
613+
return result if result else False

tests/extractors/test_antibot_extractor_enricher.py

Lines changed: 17 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,9 @@
55
from .test_extractor_base import TestExtractorBase
66

77

8+
CI = os.getenv("GITHUB_ACTIONS", "") == "true"
9+
10+
811
class DummySB:
912
def __init__(self, url="", title="", visible_texts=None, visible_elements=None):
1013
self._url = url
@@ -51,56 +54,67 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
5154

5255
@pytest.mark.download
5356
@pytest.mark.parametrize(
54-
"url,in_title,in_text,image_count,video_count",
57+
"url,in_title,in_text,image_count,video_count,skip_ci",
5558
[
5659
(
5760
"https://en.wikipedia.org/wiki/Western_barn_owl",
5861
"western barn owl",
5962
"Tyto alba",
60-
5,
63+
4,
6164
0,
65+
False,
6266
),
6367
(
6468
"https://www.bellingcat.com/news/2025/04/29/open-sources-show-myanmar-junta-airstrike-damages-despite-post-earthquake-ceasefire/",
6569
"open sources show myanmar",
6670
"Bellingcat has geolocated",
6771
5,
6872
0,
73+
False,
6974
),
7075
(
7176
"https://www.bellingcat.com/news/2025/03/27/gaza-israel-palestine-shot-killed-injured-destroyed-dangerous-drone-journalists-in-gaza/",
7277
"shot from above",
7378
"continued the work of Gazan journalists",
7479
5,
7580
1,
81+
False,
7682
),
7783
(
7884
"https://www.bellingcat.com/about/general-information",
7985
"general information",
8086
"Stichting Bellingcat",
8187
0, # SVGs are ignored
8288
0,
89+
False,
8390
),
8491
(
8592
"https://vk.com/wikipedia?from=search&w=wall-36156673_20451",
8693
"Hounds of Love",
8794
"16 сентября 1985 года лейблом EMI Records.",
8895
5,
8996
0,
97+
False,
9098
),
9199
(
92100
"https://www.tiktok.com/@tracy_2424/photo/7418200173953830162",
93101
"TikTok",
94102
"Dito ko lang",
95103
1,
96104
0,
105+
True,
97106
),
98107
],
99108
)
100-
def test_download_pages_with_media(self, setup_module, make_item, url, in_title, in_text, image_count, video_count):
109+
def test_download_pages_with_media(
110+
self, setup_module, make_item, url, in_title, in_text, image_count, video_count, skip_ci
111+
):
101112
"""
102113
Test downloading pages with media.
103114
"""
115+
if CI and skip_ci:
116+
pytest.skip("Skipping test in CI environment")
117+
104118
self.extractor = setup_module(
105119
self.extractor_module,
106120
self.config

tests/extractors/test_generic_extractor.py

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -48,8 +48,6 @@ def test_load_dropin(self):
4848
("https://www.youtube.com/watch?v=5qap5aO4i9A", ["youtube"]),
4949
("https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970?lang=en", ["tiktok"]),
5050
("https://www.instagram.com/p/CU1J9JYJ9Zz/", ["instagram"]),
51-
("https://www.facebook.com/nytimes/videos/10160796550110716", ["facebook"]),
52-
("https://www.facebook.com/BylineFest/photos/t.100057299682816/927879487315946/", ["facebook"]),
5351
],
5452
)
5553
def test_suitable_extractors(self, url, suitable_extractors):
@@ -148,6 +146,7 @@ def test_bluesky_download_no_media(self, make_item):
148146
def test_bluesky_download_video(self, make_item):
149147
item = make_item("https://bsky.app/profile/bellingcat.com/post/3le2l4gsxlk2i")
150148
result = self.extractor.download(item)
149+
assert result.get_url() == "https://bsky.app/profile/bellingcat.com/post/3le2l4gsxlk2i"
151150
assert result is not False
152151

153152
@pytest.mark.skipif(not TEST_TRUTH_SOCIAL, reason="Truth social download tests disabled in environment variables.")

0 commit comments

Comments
 (0)