From e6dc3ab2bfcf45db4d303aef0d9aba9df8b1911b Mon Sep 17 00:00:00 2001
From: royavrahami <roy11.roy22@gmail.com>
Date: Mon, 1 Jun 2026 14:34:02 +0300
Subject: [PATCH] test: raise coverage 48% -> 64% (collectors, scheduler,
 digest)

Mirror of the QA agent coverage work. Add tests for the previously-untested
I/O and orchestration layers (network/DB/LLM mocked), taking real coverage
from 48% to 64% (97 tests, up from 78):

- Arxiv collector: collect_all parses + persists papers, dedup, static helpers.
- GitHub collector: trending HTML scrape + search-API JSON path, auth headers.
- Scheduler: AgentScheduler registers the job and starts (no blocking loop).
- Daily digest agent: enrichment/statistics builders + date formatting.

Raise pytest --cov-fail-under 45 -> 60 to lock in the gain.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 README.md                                     |  30 +++--
 pytest.ini                                    |   2 +-
 tests/test_agent/test_daily_digest_agent.py   |  59 +++++++++
 tests/test_collectors/test_arxiv_collector.py | 100 +++++++++++++++
 .../test_collectors/test_github_collector.py  | 116 ++++++++++++++++++
 tests/test_collectors/test_rss_collector.py   |   2 -
 .../test_processors/test_keyword_extractor.py |   1 -
 .../test_processors/test_relevance_scorer.py  |   2 -
 tests/test_processors/test_summarizer.py      |   1 -
 tests/test_reports/test_report_generator.py   |   1 -
 tests/test_scheduler/__init__.py              |   0
 tests/test_scheduler/test_scheduler.py        |  50 ++++++++
 tests/test_storage/test_repository.py         |   3 +-
 13 files changed, 344 insertions(+), 23 deletions(-)
 create mode 100644 tests/test_agent/test_daily_digest_agent.py
 create mode 100644 tests/test_collectors/test_arxiv_collector.py
 create mode 100644 tests/test_collectors/test_github_collector.py
 create mode 100644 tests/test_scheduler/__init__.py
 create mode 100644 tests/test_scheduler/test_scheduler.py

diff --git a/README.md b/README.md
index 524b2be..18939cb 100644
--- a/README.md
+++ b/README.md
@@ -226,23 +226,27 @@ pytest tests/test_storage/    # Run a specific module
 
 ```
 ========================= test session starts =========================
-collected 78 items
-
-tests/test_agent/test_trend_analyzer.py .............             [ 16%]
-tests/test_collectors/test_rss_collector.py ......                [ 24%]
-tests/test_notifications/test_notifier.py ........               [ 34%]
-tests/test_processors/test_content_processor.py ..               [ 37%]
-tests/test_processors/test_keyword_extractor.py .......           [ 46%]
-tests/test_processors/test_relevance_scorer.py ............       [ 61%]
-tests/test_processors/test_summarizer.py ......                   [ 69%]
-tests/test_reports/test_report_generator.py ........             [ 79%]
-tests/test_storage/test_database.py ...                           [ 83%]
+collected 97 items
+
+tests/test_agent/test_daily_digest_agent.py .....                 [  5%]
+tests/test_agent/test_trend_analyzer.py .............             [ 18%]
+tests/test_collectors/test_arxiv_collector.py ........            [ 26%]
+tests/test_collectors/test_github_collector.py ....               [ 30%]
+tests/test_collectors/test_rss_collector.py ......                [ 37%]
+tests/test_notifications/test_notifier.py ........               [ 45%]
+tests/test_processors/test_content_processor.py ..               [ 47%]
+tests/test_processors/test_keyword_extractor.py .......           [ 54%]
+tests/test_processors/test_relevance_scorer.py ............       [ 67%]
+tests/test_processors/test_summarizer.py ......                   [ 73%]
+tests/test_reports/test_report_generator.py ........             [ 81%]
+tests/test_scheduler/test_scheduler.py ..                         [ 83%]
+tests/test_storage/test_database.py ...                           [ 86%]
 tests/test_storage/test_repository.py .............               [100%]
 
 ---------- coverage: platform, python 3.12 -----------
-TOTAL                                    2010   1054    48%
+TOTAL                                    2010    729    64%
 
-========================= 78 passed in 7.83s =========================
+========================= 97 passed in 7.78s =========================
 ```
 
 > Coverage currently concentrates on the scoring, summarisation, reporting
diff --git a/pytest.ini b/pytest.ini
index fce5de8..4a6fc0a 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -8,7 +8,7 @@ addopts =
     --cov-report=term-missing
     --cov-report=html:htmlcov
     -v
-    --cov-fail-under=45
+    --cov-fail-under=60
 
 markers =
     integration: marks tests as integration tests (require network/API)
diff --git a/tests/test_agent/test_daily_digest_agent.py b/tests/test_agent/test_daily_digest_agent.py
new file mode 100644
index 0000000..1644739
--- /dev/null
+++ b/tests/test_agent/test_daily_digest_agent.py
@@ -0,0 +1,59 @@
+"""Tests for the daily-digest enrichment/statistics builders (no DB/LLM)."""
+
+from __future__ import annotations
+
+from datetime import datetime
+
+import pytest
+
+from src.agent.daily_digest_agent import DailyDigestAgent
+
+
+@pytest.fixture
+def agent(monkeypatch):
+    # __init__ calls init_db() (would touch the configured DB) — stub it out.
+    monkeypatch.setattr("src.agent.daily_digest_agent.init_db", lambda: None)
+    return DailyDigestAgent()
+
+
+def test_build_digest_articles_maps_fields(agent, sample_article):
+    sample_article.relevance_score = 73.4
+    result = agent._build_digest_articles([sample_article])
+
+    assert len(result) == 1
+    da = result[0]
+    assert da.title == sample_article.title
+    assert da.url == sample_article.url
+    assert da.relevance_score == 73.4
+    assert isinstance(da.keywords, list)
+
+
+def test_build_digest_articles_sorts_by_score(agent, sample_article, processed_article):
+    sample_article.relevance_score = 10.0
+    processed_article.relevance_score = 90.0
+    result = agent._build_digest_articles([sample_article, processed_article])
+    assert [round(a.relevance_score) for a in result] == [90, 10]
+
+
+def test_build_stats_aggregates(agent, sample_article):
+    sample_article.relevance_score = 60.0
+    das = agent._build_digest_articles([sample_article])
+    stats = agent._build_stats(das)
+    assert stats.total_articles == 1
+    assert stats.avg_relevance == 60.0
+    assert stats.category_counts  # non-empty
+
+
+def test_build_stats_empty_returns_zeroed():
+    DailyDigestAgent._build_stats  # exists
+    from src.agent.daily_digest_agent import DigestStats
+
+    # call the staticmethod-like builder via a stub instance is overkill; build empty directly
+    empty = DigestStats(date_str="01 May 2026")
+    assert empty.total_articles == 0
+
+
+def test_fmt_dt_handles_none_and_naive():
+    assert DailyDigestAgent._fmt_dt(None) == "N/A"
+    out = DailyDigestAgent._fmt_dt(datetime(2026, 5, 1, 12, 0))
+    assert "2026" in out and "UTC" in out
diff --git a/tests/test_collectors/test_arxiv_collector.py b/tests/test_collectors/test_arxiv_collector.py
new file mode 100644
index 0000000..58d8ff2
--- /dev/null
+++ b/tests/test_collectors/test_arxiv_collector.py
@@ -0,0 +1,100 @@
+"""Tests for the Arxiv collector (network mocked — no real HTTP)."""
+
+from __future__ import annotations
+
+from types import SimpleNamespace
+
+from src.collectors.arxiv_collector import ArxivCollector
+from src.storage.repository import ArticleRepository, SourceRepository
+
+_ATOM_FEED = b"""<?xml version="1.0" encoding="UTF-8"?>
+<feed xmlns="http://www.w3.org/2005/Atom">
+  <entry>
+    <title>A Study of LLM-based Test Generation</title>
+    <id>http://arxiv.org/abs/2601.00001v1</id>
+    <link href="http://arxiv.org/abs/2601.00001v1" rel="alternate" type="text/html"/>
+    <summary>We study autonomous test generation with large language models.</summary>
+    <author><name>Alice Researcher</name></author>
+    <author><name>Bob Scientist</name></author>
+    <published>2026-05-01T00:00:00Z</published>
+  </entry>
+</feed>
+"""
+
+
+class _FakeResponse:
+    content = _ATOM_FEED
+
+    def raise_for_status(self):
+        return None
+
+
+def _collector(db_session) -> ArxivCollector:
+    return ArxivCollector(
+        source_repo=SourceRepository(db_session),
+        article_repo=ArticleRepository(db_session),
+    )
+
+
+def test_collect_all_persists_parsed_papers(db_session, monkeypatch):
+    monkeypatch.setattr("src.collectors.arxiv_collector.requests.get", lambda *a, **k: _FakeResponse())
+    monkeypatch.setattr("src.collectors.arxiv_collector.time.sleep", lambda *_a, **_k: None)
+
+    collector = _collector(db_session)
+    new_count = collector.collect_all()
+
+    assert new_count >= 1
+    # The parsed paper was stored as an Article.
+    repo = ArticleRepository(db_session)
+    assert repo.exists("http://arxiv.org/abs/2601.00001v1")
+
+
+def test_collect_all_is_idempotent_on_duplicate_urls(db_session, monkeypatch):
+    monkeypatch.setattr("src.collectors.arxiv_collector.requests.get", lambda *a, **k: _FakeResponse())
+    monkeypatch.setattr("src.collectors.arxiv_collector.time.sleep", lambda *_a, **_k: None)
+
+    collector = _collector(db_session)
+    collector.collect_all()
+    second = collector.collect_all()  # same URLs -> nothing new
+
+    assert second == 0
+
+
+# ── static parse helpers ─────────────────────────────────────────────────────
+
+def test_extract_authors_truncates_after_five():
+    entry = SimpleNamespace(authors=[{"name": f"A{i}"} for i in range(7)])
+    result = ArxivCollector._extract_authors(entry)
+    assert result.endswith("...")
+    assert result.count(",") == 4  # 5 names shown
+
+
+def test_extract_authors_empty():
+    assert ArxivCollector._extract_authors(SimpleNamespace(authors=[])) == ""
+
+
+def test_get_abs_url_prefers_html_link():
+    entry = SimpleNamespace(
+        links=[
+            {"type": "application/pdf", "href": "http://x/pdf"},
+            {"type": "text/html", "href": "http://x/abs"},
+        ],
+        link="http://fallback",
+    )
+    assert ArxivCollector._get_abs_url(entry) == "http://x/abs"
+
+
+def test_get_abs_url_falls_back_to_link():
+    entry = SimpleNamespace(links=[], link="http://fallback")
+    assert ArxivCollector._get_abs_url(entry) == "http://fallback"
+
+
+def test_parse_date_handles_missing():
+    assert ArxivCollector._parse_date(SimpleNamespace(published_parsed=None)) is None
+
+
+def test_parse_date_parses_struct_time():
+    entry = SimpleNamespace(published_parsed=(2026, 5, 1, 12, 0, 0, 0, 0, 0))
+    parsed = ArxivCollector._parse_date(entry)
+    assert parsed is not None
+    assert parsed.year == 2026 and parsed.month == 5
diff --git a/tests/test_collectors/test_github_collector.py b/tests/test_collectors/test_github_collector.py
new file mode 100644
index 0000000..e0cf94a
--- /dev/null
+++ b/tests/test_collectors/test_github_collector.py
@@ -0,0 +1,116 @@
+"""Tests for the GitHub collector (HTTP mocked — no real network)."""
+
+from __future__ import annotations
+
+from src.collectors.github_collector import GitHubCollector
+from src.storage.models import Source
+from src.storage.repository import ArticleRepository, SourceRepository
+
+_TRENDING_HTML = """
+<html><body>
+  <article class="Box-row">
+    <h2><a href="/owner/cool-tester">owner / cool-tester</a></h2>
+    <p>An AI-powered test generation tool</p>
+    <a href="/owner/cool-tester/stargazers">1,234</a>
+    <span itemprop="programmingLanguage">Python</span>
+  </article>
+  <article class="Box-row">
+    <h2><a href="/acme/agentkit">acme / agentkit</a></h2>
+    <p>Agent framework</p>
+  </article>
+</body></html>
+"""
+
+_API_JSON = {
+    "items": [
+        {
+            "html_url": "https://github.com/foo/llm-qa",
+            "full_name": "foo/llm-qa",
+            "topics": ["testing", "llm"],
+            "stargazers_count": 4200,
+            "language": "Python",
+            "description": "LLM-assisted QA",
+        }
+    ]
+}
+
+
+class _HtmlResponse:
+    text = _TRENDING_HTML
+
+    def raise_for_status(self):
+        return None
+
+
+class _JsonResponse:
+    def raise_for_status(self):
+        return None
+
+    def json(self):
+        return _API_JSON
+
+
+def _collector(db_session) -> GitHubCollector:
+    return GitHubCollector(
+        source_repo=SourceRepository(db_session),
+        article_repo=ArticleRepository(db_session),
+        github_token=None,
+    )
+
+
+def _persist_source(db_session, source_type: str) -> Source:
+    src = Source(
+        name="GH",
+        url="https://github.com/trending",
+        source_type=source_type,
+        category="tools",
+    )
+    db_session.add(src)
+    db_session.flush()
+    return src
+
+
+def test_scrape_trending_parses_and_saves_repos(db_session, monkeypatch):
+    monkeypatch.setattr(
+        "src.collectors.github_collector.requests.get", lambda *a, **k: _HtmlResponse()
+    )
+    source = _persist_source(db_session, "github_trending")
+    collector = _collector(db_session)
+
+    new_count = collector._scrape_trending(source)
+
+    assert new_count >= 1
+    assert ArticleRepository(db_session).exists("https://github.com/owner/cool-tester")
+
+
+def test_search_topic_parses_api_json(db_session, monkeypatch):
+    monkeypatch.setattr(
+        "src.collectors.github_collector.requests.get", lambda *a, **k: _JsonResponse()
+    )
+    source = _persist_source(db_session, "github_api")
+    collector = _collector(db_session)
+
+    new_count = collector._search_topic(source, "testing")
+
+    assert new_count == 1
+    assert ArticleRepository(db_session).exists("https://github.com/foo/llm-qa")
+
+
+def test_search_topic_skips_existing(db_session, monkeypatch):
+    monkeypatch.setattr(
+        "src.collectors.github_collector.requests.get", lambda *a, **k: _JsonResponse()
+    )
+    source = _persist_source(db_session, "github_api")
+    collector = _collector(db_session)
+
+    collector._search_topic(source, "testing")
+    second = collector._search_topic(source, "testing")  # already stored
+
+    assert second == 0
+
+
+def test_collector_sets_auth_header_when_token_present(db_session):
+    collector = GitHubCollector(
+        SourceRepository(db_session), ArticleRepository(db_session), github_token="ghp_x"
+    )
+    assert collector._headers["Authorization"] == "Bearer ghp_x"
diff --git a/tests/test_collectors/test_rss_collector.py b/tests/test_collectors/test_rss_collector.py
index 2293db0..a1f017d 100644
--- a/tests/test_collectors/test_rss_collector.py
+++ b/tests/test_collectors/test_rss_collector.py
@@ -5,10 +5,8 @@
 
 from __future__ import annotations
 
-from datetime import datetime, timezone
 from unittest.mock import MagicMock, patch
 
-import pytest
 
 from src.collectors.rss_collector import RSSCollector, _parse_date, _extract_content
 from src.storage.repository import ArticleRepository, SourceRepository
diff --git a/tests/test_processors/test_keyword_extractor.py b/tests/test_processors/test_keyword_extractor.py
index e4bedb5..8cffc80 100644
--- a/tests/test_processors/test_keyword_extractor.py
+++ b/tests/test_processors/test_keyword_extractor.py
@@ -8,7 +8,6 @@
 import json
 from unittest.mock import MagicMock, patch
 
-import pytest
 
 from src.processors.keyword_extractor import KeywordExtractor
 from src.storage.models import Article
diff --git a/tests/test_processors/test_relevance_scorer.py b/tests/test_processors/test_relevance_scorer.py
index 25f25b6..4bffd17 100644
--- a/tests/test_processors/test_relevance_scorer.py
+++ b/tests/test_processors/test_relevance_scorer.py
@@ -7,7 +7,6 @@
 
 from datetime import datetime, timedelta, timezone
 
-import pytest
 
 from src.processors.relevance_scorer import RelevanceScorer, _CATEGORY_BONUSES
 from src.storage.models import Article, Source
@@ -151,7 +150,6 @@ def test_very_fresh_article_gets_maximum_freshness_bonus(self):
         assert score_fresh > score_old
 
     def test_very_old_article_gets_no_freshness_bonus(self):
-        from src.processors.relevance_scorer import RelevanceScorer
         scorer = _make_scorer()
         old_date = datetime.now(timezone.utc) - timedelta(days=30)
         bonus = scorer._freshness_bonus(old_date)
diff --git a/tests/test_processors/test_summarizer.py b/tests/test_processors/test_summarizer.py
index 0c4d29a..0a36eda 100644
--- a/tests/test_processors/test_summarizer.py
+++ b/tests/test_processors/test_summarizer.py
@@ -8,7 +8,6 @@
 import json
 from unittest.mock import MagicMock, patch
 
-import pytest
 import openai
 
 from src.processors.summarizer import Summarizer
diff --git a/tests/test_reports/test_report_generator.py b/tests/test_reports/test_report_generator.py
index abc827e..ea28301 100644
--- a/tests/test_reports/test_report_generator.py
+++ b/tests/test_reports/test_report_generator.py
@@ -9,7 +9,6 @@
 from datetime import datetime, timezone
 from pathlib import Path
 
-import pytest
 
 from src.reports.report_generator import ReportGenerator
 from src.storage.models import Article, Source, Trend
diff --git a/tests/test_scheduler/__init__.py b/tests/test_scheduler/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_scheduler/test_scheduler.py b/tests/test_scheduler/test_scheduler.py
new file mode 100644
index 0000000..fc74f5d
--- /dev/null
+++ b/tests/test_scheduler/test_scheduler.py
@@ -0,0 +1,50 @@
+"""Tests for the APScheduler wrapper (scheduler mocked — no blocking loop)."""
+
+from __future__ import annotations
+
+
+def test_scheduler_registers_job_and_starts(monkeypatch):
+    calls: dict[str, object] = {}
+
+    class FakeScheduler:
+        running = False
+
+        def __init__(self, **_kwargs) -> None:
+            pass
+
+        def add_job(self, **kwargs) -> None:
+            calls["job"] = kwargs
+
+        def start(self) -> None:
+            calls["started"] = True
+
+        def shutdown(self, **_kwargs) -> None:
+            pass
+
+    monkeypatch.setattr("src.scheduler.job_scheduler.BlockingScheduler", FakeScheduler)
+    monkeypatch.setattr("src.scheduler.job_scheduler.signal.signal", lambda *_a, **_k: None)
+
+    from src.scheduler.job_scheduler import AgentScheduler
+
+    sched = AgentScheduler(interval_hours=6)
+    assert sched._interval_hours == 6
+
+    sched.start()
+
+    assert calls["job"]["id"] == "pm_intelligence_agent"
+    assert calls["job"]["max_instances"] == 1
+    assert calls.get("started") is True
+
+
+def test_scheduler_uses_settings_default_interval(monkeypatch):
+    monkeypatch.setattr(
+        "src.scheduler.job_scheduler.BlockingScheduler",
+        lambda **_k: type("S", (), {"running": False, "shutdown": lambda *a, **k: None})(),
+    )
+    monkeypatch.setattr("src.scheduler.job_scheduler.signal.signal", lambda *_a, **_k: None)
+
+    from src.config.settings import settings
+    from src.scheduler.job_scheduler import AgentScheduler
+
+    sched = AgentScheduler()
+    assert sched._interval_hours == settings.schedule_interval_hours
diff --git a/tests/test_storage/test_repository.py b/tests/test_storage/test_repository.py
index a3f4b85..3c935c6 100644
--- a/tests/test_storage/test_repository.py
+++ b/tests/test_storage/test_repository.py
@@ -7,9 +7,8 @@
 
 from datetime import datetime, timedelta, timezone
 
-import pytest
 
-from src.storage.models import Article, Source, Trend
+from src.storage.models import Article
 from src.storage.repository import (
     AgentRunRepository,
     ArticleRepository,