diff --git a/.gitignore b/.gitignore index 6e3390e3a..7bd8f44b7 100644 --- a/.gitignore +++ b/.gitignore @@ -146,4 +146,5 @@ data/ sessions/ backups/ uploads/ +.uploads/ config/mcp.json \ No newline at end of file diff --git a/src/bot/features/file_handler.py b/src/bot/features/file_handler.py index 5454c0651..16dcc4abf 100644 --- a/src/bot/features/file_handler.py +++ b/src/bot/features/file_handler.py @@ -14,8 +14,9 @@ import zipfile from collections import defaultdict from dataclasses import dataclass +from datetime import UTC, datetime from pathlib import Path -from typing import Dict, List +from typing import Dict, List, Optional from telegram import Document @@ -131,12 +132,23 @@ def __init__(self, config: Settings, security: SecurityValidator): } async def handle_document_upload( - self, document: Document, user_id: int, context: str = "" + self, + document: Document, + user_id: int, + context: str = "", + current_dir: Optional[Path] = None, ) -> ProcessedFile: - """Process uploaded document""" + """Process uploaded document. + + current_dir is used by the "document" branch to persist binary files + (PDF, etc.) into /.uploads/ where Claude can read them via + the Read tool. Defaults to approved_directory when omitted. + """ # Download file file_path = await self._download_file(document) + original_name = document.file_name or file_path.name + persist_root = current_dir or Path(self.config.approved_directory) try: # Detect file type @@ -145,6 +157,10 @@ async def handle_document_upload( # Process based on type if file_type == "archive": return await self._process_archive(file_path, context) + elif file_type == "document": + return await self._process_document_file( + file_path, context, persist_root, original_name + ) elif file_type == "code": return await self._process_code_file(file_path, context) elif file_type == "text": @@ -153,7 +169,8 @@ async def handle_document_upload( raise ValueError(f"Unsupported file type: {file_type}") finally: - # Cleanup + # Cleanup temp download. Document branch has already copied the + # payload to /.uploads/ — that copy survives. file_path.unlink(missing_ok=True) async def _download_file(self, document: Document) -> Path: @@ -170,6 +187,20 @@ async def _download_file(self, document: Document) -> Path: return file_path + # Binary document formats: saved to disk for Claude to read via the + # appropriate tool (Read for PDF; Bash with pandoc/python-docx/openpyxl/ + # unzip for Office and OpenDocument formats). + document_extensions = { + ".pdf", + ".docx", + ".xlsx", + ".pptx", + ".odt", + ".ods", + ".odp", + ".rtf", + } + def _detect_file_type(self, file_path: Path) -> str: """Detect file type based on extension and content""" ext = file_path.suffix.lower() @@ -178,6 +209,10 @@ def _detect_file_type(self, file_path: Path) -> str: if ext in {".zip", ".tar", ".gz", ".bz2", ".xz", ".7z"}: return "archive" + # Check if document (binary — saved to disk, read by Claude tools) + if ext in self.document_extensions: + return "document" + # Check if code if ext in self.code_extensions: return "code" @@ -283,6 +318,46 @@ async def _process_code_file(self, file_path: Path, context: str) -> ProcessedFi }, ) + async def _process_document_file( + self, + file_path: Path, + context: str, + persist_root: Path, + original_name: str, + ) -> ProcessedFile: + """Persist a binary document to /.uploads/ and return a + prompt telling Claude to read it via the Read tool. + + The copy survives this call (the caller's `finally` cleans up only the + temp download), so Claude can read the file during the conversation and + even in follow-up turns. + """ + uploads_dir = persist_root / ".uploads" + uploads_dir.mkdir(parents=True, exist_ok=True) + + timestamp = datetime.now(UTC).strftime("%Y%m%d-%H%M%S-%f")[:-3] + safe_name = f"{timestamp}-{original_name}" + target = uploads_dir / safe_name + + shutil.copy2(file_path, target) + + prompt = ( + f"{context or 'User uploaded a file:'}\n\n" + f"Path: `{target}`\n\n" + "Read the file using the appropriate tool for its format " + "and answer the user's question." + ) + + return ProcessedFile( + type="document", + prompt=prompt, + metadata={ + "saved_path": str(target), + "original_name": original_name, + "size": target.stat().st_size, + }, + ) + async def _process_text_file(self, file_path: Path, context: str) -> ProcessedFile: """Process text file""" content = file_path.read_text(encoding="utf-8", errors="ignore") diff --git a/src/bot/handlers/message.py b/src/bot/handlers/message.py index bbd240840..f73efa050 100644 --- a/src/bot/handlers/message.py +++ b/src/bot/handlers/message.py @@ -1,6 +1,7 @@ """Message handlers for non-command inputs.""" import asyncio +from pathlib import Path from typing import Optional import structlog @@ -725,6 +726,13 @@ async def handle_document(update: Update, context: ContextTypes.DEFAULT_TYPE) -> parse_mode="HTML", ) + # current_dir is needed by the document branch of file_handler to + # persist binary uploads (PDF etc.) into /.uploads/ where + # Claude can reach them via its Read tool. + current_dir = Path( + context.user_data.get("current_directory", settings.approved_directory) + ) + # Check if enhanced file handler is available features = context.bot_data.get("features") file_handler = features.get_file_handler() if features else None @@ -736,6 +744,7 @@ async def handle_document(update: Update, context: ContextTypes.DEFAULT_TYPE) -> document, user_id, update.message.caption or "Please review this file:", + current_dir=current_dir, ) prompt = processed_file.prompt @@ -781,6 +790,7 @@ async def handle_document(update: Update, context: ContextTypes.DEFAULT_TYPE) -> "• Source code files (.py, .js, .ts, etc.)\n" "• Text files (.txt, .md)\n" "• Configuration files (.json, .yaml, .toml)\n" + "• Documents (.pdf)\n" "• Documentation files", parse_mode="HTML", ) @@ -805,10 +815,7 @@ async def handle_document(update: Update, context: ContextTypes.DEFAULT_TYPE) -> ) return - # Get current directory and session - current_dir = context.user_data.get( - "current_directory", settings.approved_directory - ) + # current_dir was already resolved above for the file_handler call. session_id = context.user_data.get("claude_session_id") # Process with Claude diff --git a/src/bot/orchestrator.py b/src/bot/orchestrator.py index 6d9719f0d..b71e7a33d 100644 --- a/src/bot/orchestrator.py +++ b/src/bot/orchestrator.py @@ -1192,6 +1192,13 @@ async def agentic_document( await chat.send_action("typing") progress_msg = await update.message.reply_text("Working...") + # current_dir is needed by the document branch of file_handler to persist + # binary uploads (PDF etc.) into /.uploads/ where Claude + # can reach them via its Read tool. + current_dir = Path( + context.user_data.get("current_directory", self.settings.approved_directory) + ) + # Try enhanced file handler, fall back to basic features = context.bot_data.get("features") file_handler = features.get_file_handler() if features else None @@ -1203,6 +1210,7 @@ async def agentic_document( document, user_id, update.message.caption or "Please review this file:", + current_dir=current_dir, ) prompt = processed_file.prompt except Exception: @@ -1233,10 +1241,6 @@ async def agentic_document( "Claude integration not available. Check configuration." ) return - - current_dir = context.user_data.get( - "current_directory", self.settings.approved_directory - ) session_id = context.user_data.get("claude_session_id") # Check if /new was used — skip auto-resume for this first message. diff --git a/src/security/validators.py b/src/security/validators.py index 381ba321d..9992b5be2 100644 --- a/src/security/validators.py +++ b/src/security/validators.py @@ -86,6 +86,23 @@ class SecurityValidator: ".vue", ".svelte", ".lock", + # Text-compatible document formats (UTF-8, handled inline) + ".csv", + ".tsv", + ".log", + ".ics", + ".eml", + # Binary document formats (persisted to .uploads/ for Claude to read + # via the appropriate tool — Read for PDF, Bash with converters for + # Office and OpenDocument formats) + ".pdf", + ".docx", + ".xlsx", + ".pptx", + ".odt", + ".ods", + ".odp", + ".rtf", } # Forbidden filenames and patterns diff --git a/tests/unit/test_bot/test_file_handler.py b/tests/unit/test_bot/test_file_handler.py new file mode 100644 index 000000000..1beaf81bd --- /dev/null +++ b/tests/unit/test_bot/test_file_handler.py @@ -0,0 +1,143 @@ +"""Tests for the FileHandler document branch.""" + +import tempfile +from pathlib import Path +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from src.bot.features.file_handler import FileHandler +from src.config import create_test_config + + +@pytest.fixture +def tmp_dir(): + with tempfile.TemporaryDirectory() as d: + yield Path(d) + + +@pytest.fixture +def file_handler(tmp_dir): + settings = create_test_config(approved_directory=str(tmp_dir)) + security = MagicMock() + return FileHandler(settings, security) + + +@pytest.mark.parametrize( + "filename", + [ + "report.pdf", + "contract.docx", + "budget.xlsx", + "deck.pptx", + "notes.odt", + "letter.rtf", + ], +) +def test_detect_file_type_binary_document_formats_return_document( + file_handler, filename +): + assert file_handler._detect_file_type(Path(filename)) == "document" + + +def test_detect_file_type_unknown_binary_stays_binary(file_handler, tmp_dir): + # A file with unknown extension + non-UTF-8 bytes should not become "document" + # — only whitelisted extensions in `document_extensions` do. + blob = tmp_dir / "mystery.blob" + blob.write_bytes(b"\xff\xfe\x00\x01binarydata") + assert file_handler._detect_file_type(blob) == "binary" + + +def test_detect_file_type_csv_goes_through_text_branch(file_handler, tmp_dir): + """CSV and similar UTF-8 tabular formats aren't in `document_extensions` — + they fall through to the text branch (inline content in the prompt).""" + csv_file = tmp_dir / "data.csv" + csv_file.write_text("a,b,c\n1,2,3\n", encoding="utf-8") + assert file_handler._detect_file_type(csv_file) == "text" + + +async def test_process_document_file_copies_and_builds_prompt(file_handler, tmp_dir): + """_process_document_file copies PDF to /.uploads/ with a + timestamp prefix, leaves it in place, and embeds the absolute path in + the returned prompt. + """ + source = tmp_dir / "src_ticket.pdf" + source.write_bytes(b"%PDF-1.4\nSMOKE-TOKEN-42\n%%EOF\n") + + result = await file_handler._process_document_file( + source, + context="please summarize", + persist_root=tmp_dir, + original_name="ticket.pdf", + ) + + uploads_dir = tmp_dir / ".uploads" + saved_files = list(uploads_dir.glob("*-ticket.pdf")) + assert len(saved_files) == 1 + assert saved_files[0].read_bytes() == source.read_bytes() + + assert result.type == "document" + assert str(saved_files[0]) in result.prompt + assert "please summarize" in result.prompt + assert result.metadata["saved_path"] == str(saved_files[0]) + assert result.metadata["original_name"] == "ticket.pdf" + assert result.metadata["size"] == source.stat().st_size + + +async def test_handle_document_upload_pdf_end_to_end(file_handler, tmp_dir): + """handle_document_upload routes .pdf to the document branch and persists + the file into current_dir/.uploads/.""" + pdf_bytes = b"%PDF-1.4\nSMOKE\n%%EOF\n" + + async def fake_download(target_path): + Path(target_path).write_bytes(pdf_bytes) + + tg_file = MagicMock() + tg_file.download_to_drive = AsyncMock(side_effect=fake_download) + + document = MagicMock() + document.file_name = "ticket.pdf" + document.get_file = AsyncMock(return_value=tg_file) + + # Use a subdirectory as current_dir to verify the handler honours it over + # the default approved_directory. + current_dir = tmp_dir / "project" + current_dir.mkdir() + + result = await file_handler.handle_document_upload( + document, + user_id=1, + context="summarize", + current_dir=current_dir, + ) + + uploads_dir = current_dir / ".uploads" + saved_files = list(uploads_dir.glob("*-ticket.pdf")) + assert len(saved_files) == 1 + assert saved_files[0].read_bytes() == pdf_bytes + assert result.type == "document" + assert str(saved_files[0]) in result.prompt + + +async def test_handle_document_upload_defaults_to_approved_directory( + file_handler, tmp_dir +): + """Without explicit current_dir, document branch falls back to + approved_directory — backward-compatible behaviour.""" + pdf_bytes = b"%PDF-1.4\nfallback\n%%EOF\n" + + async def fake_download(target_path): + Path(target_path).write_bytes(pdf_bytes) + + tg_file = MagicMock() + tg_file.download_to_drive = AsyncMock(side_effect=fake_download) + + document = MagicMock() + document.file_name = "fallback.pdf" + document.get_file = AsyncMock(return_value=tg_file) + + result = await file_handler.handle_document_upload(document, user_id=1, context="") + + saved_files = list((tmp_dir / ".uploads").glob("*-fallback.pdf")) + assert len(saved_files) == 1 + assert result.type == "document" diff --git a/tests/unit/test_security/test_validators.py b/tests/unit/test_security/test_validators.py index a15f5c31a..a5575f041 100644 --- a/tests/unit/test_security/test_validators.py +++ b/tests/unit/test_security/test_validators.py @@ -139,6 +139,12 @@ def test_filename_validation_valid(self, validator): "style.css", "data.sql", "build.sh", + "report.pdf", + "contract.docx", + "budget.xlsx", + "data.csv", + "access.log", + "invite.ics", ] for filename in valid_filenames: @@ -146,6 +152,12 @@ def test_filename_validation_valid(self, validator): assert valid is True assert error is None + def test_filename_pdf_exe_suffix_blocked(self, validator): + """Regression: `.pdf.exe` trap name is still blocked by dangerous patterns.""" + valid, error = validator.validate_filename("ticket.pdf.exe") + assert valid is False + assert "not allowed" in error + def test_filename_validation_invalid_extensions(self, validator): """Test rejection of invalid file extensions.""" invalid_filenames = [