Merge pull request #365 from bellingcat/dev

msramalho · web-flow · commit 56526a9ac7f1 · 2025-10-23T10:40:43.000+01:00
Facebook reels fix
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [project]
 name = "auto-archiver"
-version = "1.1.5"
+version = "1.1.6"
 description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)."
 
 requires-python = ">=3.10,<3.13"
diff --git a/src/auto_archiver/modules/generic_extractor/generic_extractor.py b/src/auto_archiver/modules/generic_extractor/generic_extractor.py
@@ -4,6 +4,7 @@
 import os
 import importlib
 import subprocess
+import traceback
 import zipfile
 
 from typing import Generator, Type
@@ -305,7 +306,7 @@ def add_metadata(self, video_data: dict, info_extractor: InfoExtractor, url: str
             result.set_url(url)
 
         if "description" in video_data and not result.get("content"):
-            result.set_content(video_data.get("description"))
+            result.set_content(video_data.pop("description"))
         # extract comments if enabled
         if self.comments and video_data.get("comments", None) is not None:
             result.set(
@@ -406,9 +407,9 @@ def get_metadata_for_video(
                             logger.error(f"Error loading subtitle file {val.get('filepath')}: {e}")
                 result.add_media(new_media)
             except Exception as e:
-                logger.error(f"Error processing entry {entry}: {e}")
+                logger.error(f"Error processing entry {str(entry)[:256]}: {e} {traceback.format_exc()}")
         if not len(result.media):
-            logger.info(f"No media found for entry {entry}, skipping.")
+            logger.info(f"No media found for entry {str(entry)[:256]}, skipping.")
             return False
 
         return self.add_metadata(data, info_extractor, url, result)
@@ -604,9 +605,9 @@ def download(self, item: Metadata, skip_proxy: bool = False) -> Metadata:
             validated_options
         )  # allsubtitles and subtitleslangs not working as expected, so default lang is always "en"
 
+        result: Metadata = None
         for info_extractor in self.suitable_extractors(url):
-            result = self.download_for_extractor(info_extractor, url, ydl)
-            if result:
-                return result
-
-        return False
+            local_result: Metadata = self.download_for_extractor(info_extractor, url, ydl)
+            if local_result:
+                result = result.merge(local_result) if result else local_result
+        return result if result else False
diff --git a/tests/extractors/test_antibot_extractor_enricher.py b/tests/extractors/test_antibot_extractor_enricher.py
@@ -5,6 +5,9 @@
 from .test_extractor_base import TestExtractorBase
 
 
+CI = os.getenv("GITHUB_ACTIONS", "") == "true"
+
+
 class DummySB:
     def __init__(self, url="", title="", visible_texts=None, visible_elements=None):
         self._url = url
@@ -51,56 +54,67 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
 
     @pytest.mark.download
     @pytest.mark.parametrize(
-        "url,in_title,in_text,image_count,video_count",
+        "url,in_title,in_text,image_count,video_count,skip_ci",
         [
             (
                 "https://en.wikipedia.org/wiki/Western_barn_owl",
                 "western barn owl",
                 "Tyto alba",
-                5,
+                4,
                 0,
+                False,
             ),
             (
                 "https://www.bellingcat.com/news/2025/04/29/open-sources-show-myanmar-junta-airstrike-damages-despite-post-earthquake-ceasefire/",
                 "open sources show myanmar",
                 "Bellingcat has geolocated",
                 5,
                 0,
+                False,
             ),
             (
                 "https://www.bellingcat.com/news/2025/03/27/gaza-israel-palestine-shot-killed-injured-destroyed-dangerous-drone-journalists-in-gaza/",
                 "shot from above",
                 "continued the work of Gazan journalists",
                 5,
                 1,
+                False,
             ),
             (
                 "https://www.bellingcat.com/about/general-information",
                 "general information",
                 "Stichting Bellingcat",
                 0,  # SVGs are ignored
                 0,
+                False,
             ),
             (
                 "https://vk.com/wikipedia?from=search&w=wall-36156673_20451",
                 "Hounds of Love",
                 "16 сентября 1985 года лейблом EMI Records.",
                 5,
                 0,
+                False,
             ),
             (
                 "https://www.tiktok.com/@tracy_2424/photo/7418200173953830162",
                 "TikTok",
                 "Dito ko lang",
                 1,
                 0,
+                True,
             ),
         ],
     )
-    def test_download_pages_with_media(self, setup_module, make_item, url, in_title, in_text, image_count, video_count):
+    def test_download_pages_with_media(
+        self, setup_module, make_item, url, in_title, in_text, image_count, video_count, skip_ci
+    ):
         """
         Test downloading pages with media.
         """
+        if CI and skip_ci:
+            pytest.skip("Skipping test in CI environment")
+
         self.extractor = setup_module(
             self.extractor_module,
             self.config
diff --git a/tests/extractors/test_generic_extractor.py b/tests/extractors/test_generic_extractor.py
@@ -48,8 +48,6 @@ def test_load_dropin(self):
             ("https://www.youtube.com/watch?v=5qap5aO4i9A", ["youtube"]),
             ("https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970?lang=en", ["tiktok"]),
             ("https://www.instagram.com/p/CU1J9JYJ9Zz/", ["instagram"]),
-            ("https://www.facebook.com/nytimes/videos/10160796550110716", ["facebook"]),
-            ("https://www.facebook.com/BylineFest/photos/t.100057299682816/927879487315946/", ["facebook"]),
         ],
     )
     def test_suitable_extractors(self, url, suitable_extractors):
@@ -148,6 +146,7 @@ def test_bluesky_download_no_media(self, make_item):
     def test_bluesky_download_video(self, make_item):
         item = make_item("https://bsky.app/profile/bellingcat.com/post/3le2l4gsxlk2i")
         result = self.extractor.download(item)
         assert result is not False
+        assert result.get_url() == "https://bsky.app/profile/bellingcat.com/post/3le2l4gsxlk2i"  # after False check
 
     @pytest.mark.skipif(not TEST_TRUTH_SOCIAL, reason="Truth social download tests disabled in environment variables.")