diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index 6917f27e9..a6a1419af 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -10,6 +10,9 @@ LXMLWebScrapingStrategy, WebScrapingStrategy, # Backward compatibility alias ) + +from .processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy + from .async_logger import ( AsyncLoggerBase, AsyncLogger, @@ -128,6 +131,8 @@ "BFSDeepCrawlStrategy", "BestFirstCrawlingStrategy", "DFSDeepCrawlStrategy", + "PDFCrawlerStrategy", + "PDFContentScrapingStrategy", "FilterChain", "URLPatternFilter", "ContentTypeFilter", diff --git a/deploy/docker/api.py b/deploy/docker/api.py index 58d8c01fe..4910bee18 100644 --- a/deploy/docker/api.py +++ b/deploy/docker/api.py @@ -13,8 +13,12 @@ from fastapi import HTTPException, Request, status from fastapi.background import BackgroundTasks from fastapi.responses import JSONResponse +from fastapi.encoders import jsonable_encoder + from redis import asyncio as aioredis +from utils import is_pdf_url + from crawl4ai import ( AsyncWebCrawler, CrawlerRunConfig, @@ -31,6 +35,10 @@ BM25ContentFilter, LLMContentFilter ) + +from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy +# from crawl4ai.async_configs import to_serializable_dict + from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy @@ -431,6 +439,23 @@ async def handle_crawl_request( urls = [('https://' + url) if not url.startswith(('http://', 'https://')) and not url.startswith(("raw:", "raw://")) else url for url in urls] browser_config = BrowserConfig.load(browser_config) crawler_config = CrawlerRunConfig.load(crawler_config) + + is_pdf_flags = await asyncio.gather(*(is_pdf_url(url) for url in urls)) + is_pdf = any(is_pdf_flags) + if any(is_pdf_flags) and not all(is_pdf_flags): + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail="Mix of PDF and non-PDF URLs in a single request is not supported yet." 
+ ) + + crawler_strategy = PDFCrawlerStrategy() if is_pdf else None + + if is_pdf and crawler_config.scraping_strategy is None: + crawler_config.scraping_strategy = PDFContentScrapingStrategy( + extract_images=False, + save_images_locally=False, + batch_size=2 + ) dispatcher = MemoryAdaptiveDispatcher( memory_threshold_percent=config["crawler"]["memory_threshold_percent"], @@ -440,7 +465,7 @@ async def handle_crawl_request( ) from crawler_pool import get_crawler - crawler = await get_crawler(browser_config) + crawler = await get_crawler(browser_config, crawler_strategy) # crawler: AsyncWebCrawler = AsyncWebCrawler(config=browser_config) # await crawler.start() @@ -458,6 +483,7 @@ async def handle_crawl_request( config=crawler_config, dispatcher=dispatcher) results = await partial_func() + results_list = results if isinstance(results, list) else [results] # await crawler.close() @@ -471,12 +497,14 @@ async def handle_crawl_request( # Process results to handle PDF bytes processed_results = [] - for result in results: + for result in results_list: result_dict = result.model_dump() # If PDF exists, encode it to base64 if result_dict.get('pdf') is not None: result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8') - processed_results.append(result_dict) + + # Keep response shape consistent with streaming (plain JSON-serializable dict) + processed_results.append(jsonable_encoder(result_dict)) return { "success": True, @@ -521,18 +549,37 @@ async def handle_stream_crawl_request( # browser_config.verbose = True # Set to False or remove for production stress testing browser_config.verbose = False crawler_config = CrawlerRunConfig.load(crawler_config) - crawler_config.scraping_strategy = LXMLWebScrapingStrategy() crawler_config.stream = True + + # Normalize URLs to include scheme (match non-streaming behavior) + urls = [('https://' + url) if not url.startswith(('http://', 'https://')) and not url.startswith(("raw:", "raw://")) else url for url in urls] + + is_pdf_flags = await asyncio.gather(*(is_pdf_url(url) for url in urls)) + is_pdf = any(is_pdf_flags) + if any(is_pdf_flags) and not all(is_pdf_flags): + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail="Mix of PDF and non-PDF URLs in a single request is not supported yet." 
+ ) + + crawler_strategy = PDFCrawlerStrategy() if is_pdf else None + + if is_pdf and crawler_config.scraping_strategy is None: + crawler_config.scraping_strategy = PDFContentScrapingStrategy( + extract_images=False, + save_images_locally=False, + batch_size=2 + ) dispatcher = MemoryAdaptiveDispatcher( memory_threshold_percent=config["crawler"]["memory_threshold_percent"], rate_limiter=RateLimiter( base_delay=tuple(config["crawler"]["rate_limiter"]["base_delay"]) - ) + ) if config["crawler"]["rate_limiter"]["enabled"] else None ) from crawler_pool import get_crawler - crawler = await get_crawler(browser_config) + crawler = await get_crawler(browser_config, crawler_strategy) # crawler = AsyncWebCrawler(config=browser_config) # await crawler.start() diff --git a/deploy/docker/crawler_pool.py b/deploy/docker/crawler_pool.py index d15102e4d..4652c0d6d 100644 --- a/deploy/docker/crawler_pool.py +++ b/deploy/docker/crawler_pool.py @@ -1,11 +1,11 @@ # crawler_pool.py (new file) import asyncio, json, hashlib, time, psutil from contextlib import suppress -from typing import Dict +from typing import Dict, Optional from crawl4ai import AsyncWebCrawler, BrowserConfig -from typing import Dict from utils import load_config + CONFIG = load_config() POOL: Dict[str, AsyncWebCrawler] = {} @@ -15,20 +15,33 @@ MEM_LIMIT = CONFIG.get("crawler", {}).get("memory_threshold_percent", 95.0) # % RAM – refuse new browsers above this IDLE_TTL = CONFIG.get("crawler", {}).get("pool", {}).get("idle_ttl_sec", 1800) # close if unused for 30 min -def _sig(cfg: BrowserConfig) -> str: - payload = json.dumps(cfg.to_dict(), sort_keys=True, separators=(",",":")) - return hashlib.sha1(payload.encode()).hexdigest() +def _sig(cfg: BrowserConfig, crawler_strategy: Optional[object] = None) -> str: + """ + Generate a unique signature for a crawler based on browser config + and optional crawler strategy. This ensures that crawlers with + different strategies (e.g., PDF) are stored separately in the pool. 
+ """ + payload = cfg.to_dict() + + if crawler_strategy is not None: + payload["strategy"] = crawler_strategy.__class__.__name__ + + json_payload = json.dumps(payload, sort_keys=True, separators=(",", ":")) + return hashlib.sha256(json_payload.encode()).hexdigest() + + -async def get_crawler(cfg: BrowserConfig) -> AsyncWebCrawler: +async def get_crawler(cfg: BrowserConfig, crawler_strategy: Optional[object] = None) -> AsyncWebCrawler: + sig: Optional[str] = None try: - sig = _sig(cfg) + sig = _sig(cfg, crawler_strategy=crawler_strategy) async with LOCK: if sig in POOL: LAST_USED[sig] = time.time(); return POOL[sig] if psutil.virtual_memory().percent >= MEM_LIMIT: raise MemoryError("RAM pressure – new browser denied") - crawler = AsyncWebCrawler(config=cfg, thread_safe=False) + crawler = AsyncWebCrawler(config=cfg, thread_safe=False, crawler_strategy=crawler_strategy) await crawler.start() POOL[sig] = crawler; LAST_USED[sig] = time.time() return crawler @@ -37,13 +50,15 @@ async def get_crawler(cfg: BrowserConfig) -> AsyncWebCrawler: except Exception as e: raise RuntimeError(f"Failed to start browser: {e}") finally: - if sig in POOL: + if sig and sig in POOL: LAST_USED[sig] = time.time() else: # If we failed to start the browser, we should remove it from the pool - POOL.pop(sig, None) - LAST_USED.pop(sig, None) + if sig: + POOL.pop(sig, None) + LAST_USED.pop(sig, None) # If we failed to start the browser, we should remove it from the pool + async def close_all(): async with LOCK: await asyncio.gather(*(c.close() for c in POOL.values()), return_exceptions=True) diff --git a/deploy/docker/utils.py b/deploy/docker/utils.py index 2e2a80ac7..cf072b89a 100644 --- a/deploy/docker/utils.py +++ b/deploy/docker/utils.py @@ -2,6 +2,7 @@ import logging import yaml import os +import httpx from datetime import datetime from enum import Enum from pathlib import Path @@ -124,4 +125,37 @@ def verify_email_domain(email: str) -> bool: records = dns.resolver.resolve(domain, 'MX') return True if records else False except Exception as e: - return False \ No newline at end of file + return False + +async def is_pdf_url(url: str) -> bool: + """ + Check if a URL points to a PDF using httpx: + - Check extension + - Check Content-Type via HEAD request + - Check first 5 bytes (magic number) if needed + """ + if url.lower().endswith(".pdf"): + return True + + timeout = httpx.Timeout(connect=5.0, read=10.0, write=5.0, pool=5.0) + async with httpx.AsyncClient(follow_redirects=True, timeout=timeout) as client: + # HEAD request to check Content-Type (ignore servers that reject HEAD) + try: + head_resp = await client.head(url, headers={"Accept": "*/*"}) + content_type = head_resp.headers.get("content-type", "").lower() + if "application/pdf" in content_type: + return True + except httpx.HTTPError: + pass + + # Fallback: GET first 5 bytes to check PDF magic number + try: + get_resp = await client.get(url, headers={"Range": "bytes=0-4", "Accept": "*/*"}) + if get_resp.status_code in (200, 206): # 206 Partial Content + return get_resp.content.startswith(b"%PDF-") + except httpx.HTTPError: + return False + + # Default: not a PDF (or unable to determine) + return False + diff --git a/docs/examples/docker/demo_docker_api.py b/docs/examples/docker/demo_docker_api.py index 0a3d51af1..a0898109b 100644 --- a/docs/examples/docker/demo_docker_api.py +++ b/docs/examples/docker/demo_docker_api.py @@ -18,15 +18,14 @@ console = Console() # --- Configuration --- -BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:8020") BASE_URL = 
os.getenv("CRAWL4AI_TEST_URL", "http://localhost:11235") # Target URLs -SIMPLE_URL = "https://example.com" # For demo purposes SIMPLE_URL = "https://httpbin.org/html" LINKS_URL = "https://httpbin.org/links/10/0" FORMS_URL = "https://httpbin.org/forms/post" # For JS demo BOOKS_URL = "http://books.toscrape.com/" # For CSS extraction PYTHON_URL = "https://python.org" # For deeper crawl +PDF_URL = "https://arxiv.org/pdf/2310.06825" # For PDF demo # Use the same sample site as deep crawl tests for consistency DEEP_CRAWL_BASE_URL = os.getenv( "DEEP_CRAWL_TEST_SITE", "https://docs.crawl4ai.com/samples/deepcrawl/") @@ -1261,6 +1260,73 @@ async def demo_config_dump_invalid(client: httpx.AsyncClient): console.print( f"[bold red]Unexpected error during invalid test:[/] {e}") +# 10. Crawl PDF + +async def demo_pdf_crawl(client: httpx.AsyncClient): + payload = { + "urls": [PDF_URL], + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "cache_mode": "BYPASS", + "scraping_strategy": { + "type": "PDFContentScrapingStrategy", + "params": { + "extract_images": False, + "save_images_locally": False, + "batch_size": 2 + } + } + } + } + } + + resp = await client.post("/crawl", json=payload) + resp.raise_for_status() + data = resp.json() + print("=== Demo: PDF Crawl ===") + print("Success:", data.get("success")) + print("Number of results:", len(data.get("results", []))) + if data.get("results"): + first = data["results"][0] + text_snippet = (first.get("text") or "")[:500] + print("Extracted text (first 500 chars):") + print(text_snippet) + +# 11. Crawl PDF stream + +async def demo_pdf_crawl_stream(client: httpx.AsyncClient): + """ + Demo: Crawl a PDF and stream the extracted text content. 
+ """ + payload = { + "urls": [PDF_URL], + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "stream": True, + "cache_mode": "BYPASS", + "scraping_strategy": { # <-- Default strategy if not set + "type": "PDFContentScrapingStrategy", + "params": { + "extract_images": False, + "save_images_locally": False, + "batch_size": 2 + } + } + } + } + } + + await stream_request( + client, + "/crawl/stream", + payload, + "Demo PDF: Streaming PDF Crawl" + ) + # --- Update Main Runner to include new demo --- async def main_demo(): @@ -1294,6 +1360,9 @@ async def main_demo(): # await demo_deep_with_llm_extraction(client) # await demo_deep_with_proxy(client) # Skips if no PROXIES env var # await demo_deep_with_ssl(client) # Added the new demo + + # await demo_pdf_crawl_stream(client) + # await demo_pdf_crawl(client) # --- Helper endpoints --- await demo_markdown_endpoint(client) diff --git a/tests/docker/test_rest_api_pdf_crawl.py b/tests/docker/test_rest_api_pdf_crawl.py new file mode 100644 index 000000000..6068a990b --- /dev/null +++ b/tests/docker/test_rest_api_pdf_crawl.py @@ -0,0 +1,134 @@ +# ==== File: test_rest_api_pdf_crawl.py ==== + +import pytest +import pytest_asyncio +import httpx +import json +import asyncio +import os +from typing import List, Dict, Any, AsyncGenerator + +from dotenv import load_dotenv +load_dotenv() # Load environment variables from .env file if present + +# --- Test Configuration --- +BASE_URL = os.getenv( + "CRAWL4AI_TEST_URL", + "http://localhost:11235", # Docker default; override via env for dev/debug (e.g., 8020) +) +PDF_TEST_URL = "https://arxiv.org/pdf/2310.06825.pdf" +PDF_TEST_INVALID_URL = "https://docs.crawl4ai.com/samples/deepcrawl/" + +# --- Helper Functions --- + +async def check_server_health(client: httpx.AsyncClient): + """Check if the server is healthy before running tests.""" + try: + response = await client.get("/health") + response.raise_for_status() + print(f"\nServer healthy: {response.json()}") + return True + except (httpx.RequestError, httpx.HTTPStatusError) as e: + pytest.fail(f"Server health check failed: {e}. 
Is the server running at {BASE_URL}?", pytrace=False) + +async def assert_crawl_result_structure(result: Dict[str, Any], check_ssl=False): + """Asserts the basic structure of a single crawl result.""" + assert isinstance(result, dict) + assert "url" in result + assert "success" in result + assert "html" in result # Basic crawls should return HTML + assert "metadata" in result + assert isinstance(result["metadata"], dict) + assert "depth" in result["metadata"] # Deep crawls add depth + + if check_ssl: + assert "ssl_certificate" in result # Check if SSL info is present + assert isinstance(result["ssl_certificate"], dict) or result["ssl_certificate"] is None + + +# --- Pytest Fixtures --- +@pytest_asyncio.fixture(scope="function") +async def async_client() -> AsyncGenerator[httpx.AsyncClient, None]: + """Provides an async HTTP client""" + # Increased timeout for potentially longer deep crawls + async with httpx.AsyncClient(base_url=BASE_URL, timeout=300.0) as client: + yield client + # No explicit close needed with 'async with' + +# --- Test Class for PDF Scraping --- +@pytest.mark.asyncio +class TestPdfScraping: + + @pytest_asyncio.fixture(autouse=True) + async def check_health_before_tests(self, async_client: httpx.AsyncClient): + """Fixture to ensure server is healthy before each test in the class.""" + await check_server_health(async_client) + + async def test_pdf_scraping_basic(self, async_client: httpx.AsyncClient): + """Test basic PDF scraping for a single PDF URL.""" + payload = { + "urls": [PDF_TEST_URL], # URL of a test PDF + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "stream": False, + "cache_mode": "BYPASS", + "scraping_strategy": { + "type": "PDFContentScrapingStrategy", # Custom PDF scraping strategy + "params": {} + }, + } + } + } + + response = await async_client.post("/crawl", json=payload) + response.raise_for_status() + data = response.json() + + assert data["success"] is True + assert len(data["results"]) == 1 + + result = data["results"][0] + await assert_crawl_result_structure(result) + assert result["success"] is True + assert "extracted_content" in result + assert result["extracted_content"] is not None + + content = result.get("extracted_content") + extracted_text = content.get("text", "") if isinstance(content, dict) else (content or "") + assert isinstance(extracted_text, str) and len(extracted_text) > 0 + + async def test_pdf_scraping_non_accessible(self, async_client: httpx.AsyncClient): + """Test PDF scraping when PDF is not accessible.""" + payload = { + "urls": [PDF_TEST_INVALID_URL], + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "stream": False, + "cache_mode": "BYPASS", + "scraping_strategy": { + "type": "PDFContentScrapingStrategy", + "params": {} + }, + } + } + } + + response = await async_client.post("/crawl", json=payload) + + data = response.json() + assert data["success"] is True + result = data["results"][0] + assert result["success"] is False + assert "extracted_content" not in result or result["extracted_content"] is None + + +# --- Main Execution Block (for running script directly) --- +if __name__ == "__main__": + pytest_args = ["-v", "-s", __file__] + # Example: Run only proxy test + # pytest_args.append("-k test_deep_crawl_with_proxies") + print(f"Running pytest with args: {pytest_args}") + exit_code = pytest.main(pytest_args) + print(f"Pytest finished with exit code: {exit_code}") \ No newline at end of file
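With the server built from this patch running, the new PDF path on /crawl can be exercised by a standalone client. A minimal sketch, assuming the demo's default base URL (http://localhost:11235); the payload mirrors demo_pdf_crawl and test_pdf_scraping_basic, and the file/function names are illustrative only.

# pdf_crawl_client.py -- minimal sketch mirroring demo_pdf_crawl / test_pdf_scraping_basic
import asyncio
import httpx

BASE_URL = "http://localhost:11235"             # demo default; override as needed
PDF_URL = "https://arxiv.org/pdf/2310.06825"    # sample PDF used by the demo

async def crawl_pdf() -> None:
    payload = {
        "urls": [PDF_URL],
        "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
        "crawler_config": {
            "type": "CrawlerRunConfig",
            "params": {
                "cache_mode": "BYPASS",
                # Explicit strategy, as in the demo. handle_crawl_request would also
                # fall back to PDFContentScrapingStrategy for URLs detected as PDFs.
                "scraping_strategy": {
                    "type": "PDFContentScrapingStrategy",
                    "params": {"extract_images": False, "save_images_locally": False, "batch_size": 2},
                },
            },
        },
    }
    async with httpx.AsyncClient(base_url=BASE_URL, timeout=300.0) as client:
        resp = await client.post("/crawl", json=payload)
        resp.raise_for_status()
        data = resp.json()
        print("success:", data.get("success"), "| results:", len(data.get("results", [])))

if __name__ == "__main__":
    asyncio.run(crawl_pdf())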
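handle_stream_crawl_request received the same detection and fallback, so /crawl/stream accepts PDF URLs as well. A streaming client sketch; it assumes the endpoint emits newline-delimited JSON, one object per result (the demo delegates to a stream_request helper that is not shown in this diff, so the parsing below is an assumption).

# pdf_crawl_stream_client.py -- sketch; assumes newline-delimited JSON responses
import asyncio
import json
import httpx

BASE_URL = "http://localhost:11235"
PDF_URL = "https://arxiv.org/pdf/2310.06825"

async def stream_pdf_crawl() -> None:
    payload = {
        "urls": [PDF_URL],
        "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
        "crawler_config": {
            "type": "CrawlerRunConfig",
            "params": {
                "stream": True,
                "cache_mode": "BYPASS",
                "scraping_strategy": {
                    "type": "PDFContentScrapingStrategy",
                    "params": {"extract_images": False, "save_images_locally": False, "batch_size": 2},
                },
            },
        },
    }
    async with httpx.AsyncClient(base_url=BASE_URL, timeout=300.0) as client:
        async with client.stream("POST", "/crawl/stream", json=payload) as resp:
            resp.raise_for_status()
            async for line in resp.aiter_lines():
                if not line.strip():
                    continue
                record = json.loads(line)  # one result (or status marker) per line -- assumed format
                print(record.get("url"), record.get("success"))

if __name__ == "__main__":
    asyncio.run(stream_pdf_crawl())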
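Both handlers decide whether to swap in the PDF strategies by running utils.is_pdf_url over the request's URLs with asyncio.gather. A quick standalone check of that helper, assuming it is run from deploy/docker/ where utils.py and its dependencies are importable; the URLs are the ones used elsewhere in this patch.

# check_pdf_detection.py -- sketch; run from deploy/docker/ so `utils` resolves
import asyncio

from utils import is_pdf_url

URLS = [
    "https://arxiv.org/pdf/2310.06825",               # served as application/pdf
    "https://docs.crawl4ai.com/samples/deepcrawl/",   # regular HTML page
]

async def main() -> None:
    # Same pattern the API handlers use before choosing a crawler strategy.
    flags = await asyncio.gather(*(is_pdf_url(url) for url in URLS))
    for url, flag in zip(URLS, flags):
        print(f"{flag!s:<5} {url}")

if __name__ == "__main__":
    asyncio.run(main())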
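Finally, the new exports in crawl4ai/__init__.py make the same strategies usable directly, outside the Docker service. A minimal library sketch that mirrors how crawler_pool.get_crawler and handle_crawl_request wire them up; the PDFContentScrapingStrategy parameters are the fallback values api.py applies, and the arun/context-manager usage follows the standard crawl4ai API.

# pdf_library_usage.py -- sketch of direct (non-Docker) use of the new exports
import asyncio

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai import PDFCrawlerStrategy, PDFContentScrapingStrategy  # exported by this patch

PDF_URL = "https://arxiv.org/pdf/2310.06825"  # sample PDF from the demo

async def main() -> None:
    run_config = CrawlerRunConfig(
        scraping_strategy=PDFContentScrapingStrategy(
            extract_images=False,        # same fallback values api.py uses
            save_images_locally=False,
            batch_size=2,
        ),
    )
    # PDFCrawlerStrategy fetches and parses the PDF instead of driving a browser page,
    # just as the pooled crawler does when api.py passes it to get_crawler().
    async with AsyncWebCrawler(crawler_strategy=PDFCrawlerStrategy()) as crawler:
        result = await crawler.arun(url=PDF_URL, config=run_config)
        print("success:", result.success)
        print(str(result.markdown or "")[:300])

if __name__ == "__main__":
    asyncio.run(main())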