Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -146,4 +146,5 @@ data/
sessions/
backups/
uploads/
.uploads/
config/mcp.json
83 changes: 79 additions & 4 deletions src/bot/features/file_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,9 @@
import zipfile
from collections import defaultdict
from dataclasses import dataclass
from datetime import UTC, datetime
from pathlib import Path
from typing import Dict, List
from typing import Dict, List, Optional

from telegram import Document

Expand Down Expand Up @@ -131,12 +132,23 @@ def __init__(self, config: Settings, security: SecurityValidator):
}

async def handle_document_upload(
self, document: Document, user_id: int, context: str = ""
self,
document: Document,
user_id: int,
context: str = "",
current_dir: Optional[Path] = None,
) -> ProcessedFile:
"""Process uploaded document"""
"""Process uploaded document.

current_dir is used by the "document" branch to persist binary files
(PDF, etc.) into <current_dir>/.uploads/ where Claude can read them via
the Read tool. Defaults to approved_directory when omitted.
"""

# Download file
file_path = await self._download_file(document)
original_name = document.file_name or file_path.name
persist_root = current_dir or Path(self.config.approved_directory)

try:
# Detect file type
Expand All @@ -145,6 +157,10 @@ async def handle_document_upload(
# Process based on type
if file_type == "archive":
return await self._process_archive(file_path, context)
elif file_type == "document":
return await self._process_document_file(
file_path, context, persist_root, original_name
)
elif file_type == "code":
return await self._process_code_file(file_path, context)
elif file_type == "text":
Expand All @@ -153,7 +169,8 @@ async def handle_document_upload(
raise ValueError(f"Unsupported file type: {file_type}")

finally:
# Cleanup
# Cleanup temp download. Document branch has already copied the
# payload to <persist_root>/.uploads/ — that copy survives.
file_path.unlink(missing_ok=True)

async def _download_file(self, document: Document) -> Path:
Expand All @@ -170,6 +187,20 @@ async def _download_file(self, document: Document) -> Path:

return file_path

# Binary document formats: saved to disk for Claude to read via the
# appropriate tool (Read for PDF; Bash with pandoc/python-docx/openpyxl/
# unzip for Office and OpenDocument formats).
document_extensions = {
".pdf",
".docx",
".xlsx",
".pptx",
".odt",
".ods",
".odp",
".rtf",
}

def _detect_file_type(self, file_path: Path) -> str:
"""Detect file type based on extension and content"""
ext = file_path.suffix.lower()
Expand All @@ -178,6 +209,10 @@ def _detect_file_type(self, file_path: Path) -> str:
if ext in {".zip", ".tar", ".gz", ".bz2", ".xz", ".7z"}:
return "archive"

# Check if document (binary — saved to disk, read by Claude tools)
if ext in self.document_extensions:
return "document"

# Check if code
if ext in self.code_extensions:
return "code"
Expand Down Expand Up @@ -283,6 +318,46 @@ async def _process_code_file(self, file_path: Path, context: str) -> ProcessedFi
},
)

async def _process_document_file(
self,
file_path: Path,
context: str,
persist_root: Path,
original_name: str,
) -> ProcessedFile:
"""Persist a binary document to <persist_root>/.uploads/ and return a
prompt telling Claude to read it via the Read tool.

The copy survives this call (the caller's `finally` cleans up only the
temp download), so Claude can read the file during the conversation and
even in follow-up turns.
"""
uploads_dir = persist_root / ".uploads"
uploads_dir.mkdir(parents=True, exist_ok=True)

timestamp = datetime.now(UTC).strftime("%Y%m%d-%H%M%S-%f")[:-3]
safe_name = f"{timestamp}-{original_name}"
target = uploads_dir / safe_name

shutil.copy2(file_path, target)

prompt = (
f"{context or 'User uploaded a file:'}\n\n"
f"Path: `{target}`\n\n"
"Read the file using the appropriate tool for its format "
"and answer the user's question."
)

return ProcessedFile(
type="document",
prompt=prompt,
metadata={
"saved_path": str(target),
"original_name": original_name,
"size": target.stat().st_size,
},
)

async def _process_text_file(self, file_path: Path, context: str) -> ProcessedFile:
"""Process text file"""
content = file_path.read_text(encoding="utf-8", errors="ignore")
Expand Down
15 changes: 11 additions & 4 deletions src/bot/handlers/message.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Message handlers for non-command inputs."""

import asyncio
from pathlib import Path
from typing import Optional

import structlog
Expand Down Expand Up @@ -725,6 +726,13 @@ async def handle_document(update: Update, context: ContextTypes.DEFAULT_TYPE) ->
parse_mode="HTML",
)

# current_dir is needed by the document branch of file_handler to
# persist binary uploads (PDF etc.) into <current_dir>/.uploads/ where
# Claude can reach them via its Read tool.
current_dir = Path(
context.user_data.get("current_directory", settings.approved_directory)
)

# Check if enhanced file handler is available
features = context.bot_data.get("features")
file_handler = features.get_file_handler() if features else None
Expand All @@ -736,6 +744,7 @@ async def handle_document(update: Update, context: ContextTypes.DEFAULT_TYPE) ->
document,
user_id,
update.message.caption or "Please review this file:",
current_dir=current_dir,
)
prompt = processed_file.prompt

Expand Down Expand Up @@ -781,6 +790,7 @@ async def handle_document(update: Update, context: ContextTypes.DEFAULT_TYPE) ->
"• Source code files (.py, .js, .ts, etc.)\n"
"• Text files (.txt, .md)\n"
"• Configuration files (.json, .yaml, .toml)\n"
"• Documents (.pdf)\n"
"• Documentation files",
parse_mode="HTML",
)
Expand All @@ -805,10 +815,7 @@ async def handle_document(update: Update, context: ContextTypes.DEFAULT_TYPE) ->
)
return

# Get current directory and session
current_dir = context.user_data.get(
"current_directory", settings.approved_directory
)
# current_dir was already resolved above for the file_handler call.
session_id = context.user_data.get("claude_session_id")

# Process with Claude
Expand Down
12 changes: 8 additions & 4 deletions src/bot/orchestrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -1192,6 +1192,13 @@ async def agentic_document(
await chat.send_action("typing")
progress_msg = await update.message.reply_text("Working...")

# current_dir is needed by the document branch of file_handler to persist
# binary uploads (PDF etc.) into <current_dir>/.uploads/ where Claude
# can reach them via its Read tool.
current_dir = Path(
context.user_data.get("current_directory", self.settings.approved_directory)
)

# Try enhanced file handler, fall back to basic
features = context.bot_data.get("features")
file_handler = features.get_file_handler() if features else None
Expand All @@ -1203,6 +1210,7 @@ async def agentic_document(
document,
user_id,
update.message.caption or "Please review this file:",
current_dir=current_dir,
)
prompt = processed_file.prompt
except Exception:
Expand Down Expand Up @@ -1233,10 +1241,6 @@ async def agentic_document(
"Claude integration not available. Check configuration."
)
return

current_dir = context.user_data.get(
"current_directory", self.settings.approved_directory
)
session_id = context.user_data.get("claude_session_id")

# Check if /new was used — skip auto-resume for this first message.
Expand Down
17 changes: 17 additions & 0 deletions src/security/validators.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,23 @@ class SecurityValidator:
".vue",
".svelte",
".lock",
# Text-compatible document formats (UTF-8, handled inline)
".csv",
".tsv",
".log",
".ics",
".eml",
# Binary document formats (persisted to .uploads/ for Claude to read
# via the appropriate tool — Read for PDF, Bash with converters for
# Office and OpenDocument formats)
".pdf",
".docx",
".xlsx",
".pptx",
".odt",
".ods",
".odp",
".rtf",
}

# Forbidden filenames and patterns
Expand Down
143 changes: 143 additions & 0 deletions tests/unit/test_bot/test_file_handler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
"""Tests for the FileHandler document branch."""

import tempfile
from pathlib import Path
from unittest.mock import AsyncMock, MagicMock

import pytest

from src.bot.features.file_handler import FileHandler
from src.config import create_test_config


@pytest.fixture
def tmp_dir():
with tempfile.TemporaryDirectory() as d:
yield Path(d)


@pytest.fixture
def file_handler(tmp_dir):
settings = create_test_config(approved_directory=str(tmp_dir))
security = MagicMock()
return FileHandler(settings, security)


@pytest.mark.parametrize(
"filename",
[
"report.pdf",
"contract.docx",
"budget.xlsx",
"deck.pptx",
"notes.odt",
"letter.rtf",
],
)
def test_detect_file_type_binary_document_formats_return_document(
file_handler, filename
):
assert file_handler._detect_file_type(Path(filename)) == "document"


def test_detect_file_type_unknown_binary_stays_binary(file_handler, tmp_dir):
# A file with unknown extension + non-UTF-8 bytes should not become "document"
# — only whitelisted extensions in `document_extensions` do.
blob = tmp_dir / "mystery.blob"
blob.write_bytes(b"\xff\xfe\x00\x01binarydata")
assert file_handler._detect_file_type(blob) == "binary"


def test_detect_file_type_csv_goes_through_text_branch(file_handler, tmp_dir):
"""CSV and similar UTF-8 tabular formats aren't in `document_extensions` —
they fall through to the text branch (inline content in the prompt)."""
csv_file = tmp_dir / "data.csv"
csv_file.write_text("a,b,c\n1,2,3\n", encoding="utf-8")
assert file_handler._detect_file_type(csv_file) == "text"


async def test_process_document_file_copies_and_builds_prompt(file_handler, tmp_dir):
"""_process_document_file copies PDF to <persist_root>/.uploads/ with a
timestamp prefix, leaves it in place, and embeds the absolute path in
the returned prompt.
"""
source = tmp_dir / "src_ticket.pdf"
source.write_bytes(b"%PDF-1.4\nSMOKE-TOKEN-42\n%%EOF\n")

result = await file_handler._process_document_file(
source,
context="please summarize",
persist_root=tmp_dir,
original_name="ticket.pdf",
)

uploads_dir = tmp_dir / ".uploads"
saved_files = list(uploads_dir.glob("*-ticket.pdf"))
assert len(saved_files) == 1
assert saved_files[0].read_bytes() == source.read_bytes()

assert result.type == "document"
assert str(saved_files[0]) in result.prompt
assert "please summarize" in result.prompt
assert result.metadata["saved_path"] == str(saved_files[0])
assert result.metadata["original_name"] == "ticket.pdf"
assert result.metadata["size"] == source.stat().st_size


async def test_handle_document_upload_pdf_end_to_end(file_handler, tmp_dir):
"""handle_document_upload routes .pdf to the document branch and persists
the file into current_dir/.uploads/."""
pdf_bytes = b"%PDF-1.4\nSMOKE\n%%EOF\n"

async def fake_download(target_path):
Path(target_path).write_bytes(pdf_bytes)

tg_file = MagicMock()
tg_file.download_to_drive = AsyncMock(side_effect=fake_download)

document = MagicMock()
document.file_name = "ticket.pdf"
document.get_file = AsyncMock(return_value=tg_file)

# Use a subdirectory as current_dir to verify the handler honours it over
# the default approved_directory.
current_dir = tmp_dir / "project"
current_dir.mkdir()

result = await file_handler.handle_document_upload(
document,
user_id=1,
context="summarize",
current_dir=current_dir,
)

uploads_dir = current_dir / ".uploads"
saved_files = list(uploads_dir.glob("*-ticket.pdf"))
assert len(saved_files) == 1
assert saved_files[0].read_bytes() == pdf_bytes
assert result.type == "document"
assert str(saved_files[0]) in result.prompt


async def test_handle_document_upload_defaults_to_approved_directory(
file_handler, tmp_dir
):
"""Without explicit current_dir, document branch falls back to
approved_directory — backward-compatible behaviour."""
pdf_bytes = b"%PDF-1.4\nfallback\n%%EOF\n"

async def fake_download(target_path):
Path(target_path).write_bytes(pdf_bytes)

tg_file = MagicMock()
tg_file.download_to_drive = AsyncMock(side_effect=fake_download)

document = MagicMock()
document.file_name = "fallback.pdf"
document.get_file = AsyncMock(return_value=tg_file)

result = await file_handler.handle_document_upload(document, user_id=1, context="")

saved_files = list((tmp_dir / ".uploads").glob("*-fallback.pdf"))
assert len(saved_files) == 1
assert result.type == "document"
Loading