Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
bf9ae80
feat: claude integration
Prajna1999 May 27, 2026
b279ff7
feat: adapter for google-vertex
Prajna1999 May 28, 2026
8f5e6b5
feat: add boto3 deps for AWS key service and audio path refactor
Prajna1999 May 31, 2026
8cc64b1
fea: BYOK for secrets manager
Prajna1999 Jun 1, 2026
115bea2
Merge branch 'main' into feature/gemini-vertex-gemini-enterprise
Prajna1999 Jun 1, 2026
7fc2ad1
Merge branch 'main' into feature/gemini-vertex-gemini-enterprise
Prajna1999 Jun 2, 2026
a7eff4b
feat: remove secrets manager
Prajna1999 Jun 2, 2026
6a0fdb8
fix: mask dicts
Prajna1999 Jun 2, 2026
5255ffa
Merge branch 'main' into feature/claude-integration
Prajna1999 Jun 2, 2026
7a7b7ed
Merge branch 'feature/gemini-vertex-gemini-enterprise' into feature/c…
Prajna1999 Jun 2, 2026
1fb5a53
feat: files api beta support in Claude
Prajna1999 Jun 2, 2026
9ff5c72
feat: claude PDF/Image intergration
Prajna1999 Jun 2, 2026
3bd2206
Merge branch 'main' into feature/claude-integration
Prajna1999 Jun 4, 2026
8afea28
fix: remove anthropic, vertex model
Prajna1999 Jun 4, 2026
b6605c4
fix: json file instead of json file path
Prajna1999 Jun 4, 2026
534c5eb
Merge branch 'main' into feature/claude-integration
Prajna1999 Jun 4, 2026
3c387e7
feat: make audio download path airtight
Prajna1999 Jun 4, 2026
9b3494f
Merge remote-tracking branch 'refs/remotes/origin/feature/claude-inte…
Prajna1999 Jun 4, 2026
3baa4b2
chore: comment all claude test cases
Prajna1999 Jun 4, 2026
ab82e49
fix: test cases
Prajna1999 Jun 4, 2026
c93e77d
fix: one more gai test cases
Prajna1999 Jun 4, 2026
bf1565a
fix: anthropic mapper test cases
Prajna1999 Jun 4, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 44 additions & 6 deletions backend/app/core/audio_utils.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,56 @@
"""
Audio processing utilities for format conversion.
"""Audio processing utilities: format conversion + STT input carrier."""

This module provides utilities for converting audio between different formats,
particularly for TTS output post-processing.
"""
import io
import logging
import os
import tempfile
import wave
from contextlib import contextmanager
from dataclasses import dataclass
from typing import Iterator

from pydub import AudioSegment


logger = logging.getLogger(__name__)

_MIME_TO_EXT = {
"audio/wav": ".wav",
"audio/mpeg": ".mp3",
"audio/mp3": ".mp3",
"audio/ogg": ".ogg",
"audio/flac": ".flac",
"audio/webm": ".webm",
"audio/mp4": ".mp4",
"audio/m4a": ".m4a",
"audio/aac": ".aac",
"audio/aiff": ".aiff",
}
Comment on lines +16 to +27
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

Add the missing WAV MIME aliases here.

AudioRef.to_path() now relies on this table to pick the temp-file suffix, but audio/wave and audio/x-wav are missing even though backend/app/utils.py:get_file_extension() already treats them as WAV. Valid WAV uploads using those MIME types will therefore materialize as *.audio, which can break SDKs that infer the format from the filename extension.

Suggested fix
 _MIME_TO_EXT = {
     "audio/wav": ".wav",
+    "audio/wave": ".wav",
+    "audio/x-wav": ".wav",
     "audio/mpeg": ".mp3",
     "audio/mp3": ".mp3",
     "audio/ogg": ".ogg",
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@backend/app/core/audio_utils.py` around lines 16 - 27, The _MIME_TO_EXT
mapping is missing WAV aliases so AudioRef.to_path() can pick the proper .wav
suffix; add "audio/wave" and "audio/x-wav" entries mapping to ".wav" in the
_MIME_TO_EXT dict (the same canonical value used for "audio/wav") so it matches
backend/app/utils.py:get_file_extension() behavior and prevents temp files from
being created with a generic ".audio" extension.



@dataclass(frozen=True)
class AudioRef:
"""In-memory STT input. Providers consume ``bytes_`` directly or call
``to_path()`` when an SDK needs a filesystem path. Temp files are owned
by the provider's ``with`` scope — no framework-level cleanup needed.
"""

bytes_: bytes
mime_type: str = "audio/wav"

@contextmanager
def to_path(self) -> Iterator[str]:
ext = _MIME_TO_EXT.get(self.mime_type, ".audio")
tmp = tempfile.NamedTemporaryFile(suffix=ext, delete=False, prefix="audio_")
try:
tmp.write(self.bytes_)
tmp.close()
yield tmp.name
finally:
try:
os.unlink(tmp.name)
except OSError:
pass


def convert_pcm_to_mp3(
pcm_bytes: bytes, sample_rate: int = 24000
Expand Down
1 change: 1 addition & 0 deletions backend/app/core/cloud/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@
CloudStorage,
CloudStorageError,
get_cloud_storage,
upload_audio_to_gcs,
)
156 changes: 136 additions & 20 deletions backend/app/core/cloud/storage.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import os
import mimetypes
import filetype
from sqlmodel import Session
from uuid import UUID
from uuid import UUID, uuid4
import logging
import functools as ft
from pathlib import Path
Expand All @@ -12,10 +14,19 @@
from fastapi import UploadFile
from botocore.exceptions import ClientError
from botocore.response import StreamingBody
from google.cloud import storage as gcs
from google.oauth2 import service_account

from app.crud import get_project_by_id
from app.core.config import settings
from app.utils import mask_string


def _mask(value: str | None) -> str:
# Lazy to break a top-level cycle: app.utils transitively imports
# app.services.llm.providers, which imports this module.
from app.utils import mask_string

return mask_string(value)


logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -46,7 +57,7 @@ def create(self):
except ValueError as err:
logger.error(
f"[AmazonCloudStorageClient.create] Invalid bucket configuration | "
f"{{'bucket': '{mask_string(settings.AWS_S3_BUCKET)}', 'error': '{str(err)}'}}",
f"{{'bucket': '{_mask(settings.AWS_S3_BUCKET)}', 'error': '{str(err)}'}}",
exc_info=True,
)
raise CloudStorageError(err) from err
Expand All @@ -55,13 +66,13 @@ def create(self):
if response != 404:
logger.error(
f"[AmazonCloudStorageClient.create] Unexpected AWS error | "
f"{{'bucket': '{mask_string(settings.AWS_S3_BUCKET)}', 'error': '{str(err)}', 'code': {response}}}",
f"{{'bucket': '{_mask(settings.AWS_S3_BUCKET)}', 'error': '{str(err)}', 'code': {response}}}",
exc_info=True,
)
raise CloudStorageError(err) from err
logger.warning(
f"[AmazonCloudStorageClient.create] Bucket not found, creating | "
f"{{'bucket': '{mask_string(settings.AWS_S3_BUCKET)}'}}"
f"{{'bucket': '{_mask(settings.AWS_S3_BUCKET)}'}}"
)
try:
self.client.create_bucket(
Expand All @@ -72,12 +83,12 @@ def create(self):
)
logger.info(
f"[AmazonCloudStorageClient.create] Bucket created successfully | "
f"{{'bucket': '{mask_string(settings.AWS_S3_BUCKET)}'}}"
f"{{'bucket': '{_mask(settings.AWS_S3_BUCKET)}'}}"
)
except ClientError as create_err:
logger.error(
f"[AmazonCloudStorageClient.create] Failed to create bucket | "
f"{{'bucket': '{mask_string(settings.AWS_S3_BUCKET)}', 'error': '{str(create_err)}'}}",
f"{{'bucket': '{_mask(settings.AWS_S3_BUCKET)}', 'error': '{str(create_err)}'}}",
exc_info=True,
)
raise CloudStorageError(create_err) from create_err
Expand Down Expand Up @@ -168,12 +179,12 @@ def put(self, source: UploadFile, file_path: Path) -> SimpleStorageName:
)
logger.info(
f"[AmazonCloudStorage.put] File uploaded successfully | "
f"{{'project_id': '{self.project_id}', 'bucket': '{mask_string(destination.Bucket)}', 'key': '{mask_string(destination.Key)}'}}"
f"{{'project_id': '{self.project_id}', 'bucket': '{_mask(destination.Bucket)}', 'key': '{_mask(destination.Key)}'}}"
)
except ClientError as err:
logger.error(
f"[AmazonCloudStorage.put] AWS upload error | "
f"{{'project_id': '{self.project_id}', 'bucket': '{mask_string(destination.Bucket)}', 'key': '{mask_string(destination.Key)}', 'error': '{str(err)}'}}",
f"{{'project_id': '{self.project_id}', 'bucket': '{_mask(destination.Bucket)}', 'key': '{_mask(destination.Key)}', 'error': '{str(err)}'}}",
exc_info=True,
)
raise CloudStorageError(f'AWS Error: "{err}"') from err
Expand All @@ -187,13 +198,13 @@ def stream(self, url: str) -> StreamingBody:
body = self.aws.client.get_object(**kwargs).get("Body")
logger.info(
f"[AmazonCloudStorage.stream] File streamed successfully | "
f"{{'project_id': '{self.project_id}', 'bucket': '{mask_string(name.Bucket)}', 'key': '{mask_string(name.Key)}'}}"
f"{{'project_id': '{self.project_id}', 'bucket': '{_mask(name.Bucket)}', 'key': '{_mask(name.Key)}'}}"
)
return body
except ClientError as err:
logger.error(
f"[AmazonCloudStorage.stream] AWS stream error | "
f"{{'project_id': '{self.project_id}', 'bucket': '{mask_string(name.Bucket)}', 'key': '{mask_string(name.Key)}', 'error': '{str(err)}'}}",
f"{{'project_id': '{self.project_id}', 'bucket': '{_mask(name.Bucket)}', 'key': '{_mask(name.Key)}', 'error': '{str(err)}'}}",
exc_info=True,
)
raise CloudStorageError(f'AWS Error: "{err}" ({url})') from err
Expand All @@ -206,13 +217,13 @@ def get(self, url: str) -> bytes:
content = body.read()
logger.info(
f"[AmazonCloudStorage.get] File retrieved successfully | "
f"{{'project_id': '{self.project_id}', 'bucket': '{mask_string(name.Bucket)}', 'key': '{mask_string(name.Key)}', 'size_bytes': {len(content)}}}"
f"{{'project_id': '{self.project_id}', 'bucket': '{_mask(name.Bucket)}', 'key': '{_mask(name.Key)}', 'size_bytes': {len(content)}}}"
)
return content
except ClientError as err:
logger.error(
f"[AmazonCloudStorage.get] AWS get error | "
f"{{'project_id': '{self.project_id}', 'bucket': '{mask_string(name.Bucket)}', 'key': '{mask_string(name.Key)}', 'error': '{str(err)}'}}",
f"{{'project_id': '{self.project_id}', 'bucket': '{_mask(name.Bucket)}', 'key': '{_mask(name.Key)}', 'error': '{str(err)}'}}",
exc_info=True,
)
raise CloudStorageError(f'AWS Error: "{err}" ({url})') from err
Expand All @@ -226,13 +237,13 @@ def get_file_size_kb(self, url: str) -> float:
size_kb = round(size_bytes / 1024, 2)
logger.info(
f"[AmazonCloudStorage.get_file_size_kb] File size retrieved successfully | "
f"{{'project_id': '{self.project_id}', 'bucket': '{mask_string(name.Bucket)}', 'key': '{mask_string(name.Key)}', 'size_kb': {size_kb}}}"
f"{{'project_id': '{self.project_id}', 'bucket': '{_mask(name.Bucket)}', 'key': '{_mask(name.Key)}', 'size_kb': {size_kb}}}"
)
return size_kb
except ClientError as err:
logger.error(
f"[AmazonCloudStorage.get_file_size_kb] AWS head object error | "
f"{{'project_id': '{self.project_id}', 'bucket': '{mask_string(name.Bucket)}', 'key': '{mask_string(name.Key)}', 'error': '{str(err)}'}}",
f"{{'project_id': '{self.project_id}', 'bucket': '{_mask(name.Bucket)}', 'key': '{_mask(name.Key)}', 'error': '{str(err)}'}}",
exc_info=True,
)
raise CloudStorageError(f'AWS Error: "{err}" ({url})') from err
Expand All @@ -259,13 +270,13 @@ def get_signed_url(self, url: str, expires_in: int = 3600) -> str:
)
logger.info(
f"[AmazonCloudStorage.get_signed_url] Signed URL generated | "
f"{{'project_id': '{self.project_id}', 'bucket': '{mask_string(name.Bucket)}', 'key': '{mask_string(name.Key)}'}}"
f"{{'project_id': '{self.project_id}', 'bucket': '{_mask(name.Bucket)}', 'key': '{_mask(name.Key)}'}}"
)
return signed_url
except ClientError as err:
logger.error(
f"[AmazonCloudStorage.get_signed_url] AWS presign error | "
f"{{'project_id': '{self.project_id}', 'bucket': '{mask_string(name.Bucket)}', 'key': '{mask_string(name.Key)}', 'error': '{str(err)}'}}",
f"{{'project_id': '{self.project_id}', 'bucket': '{_mask(name.Bucket)}', 'key': '{_mask(name.Key)}', 'error': '{str(err)}'}}",
exc_info=True,
)
raise CloudStorageError(f'AWS Error: "{err}" ({url})') from err
Expand All @@ -277,12 +288,12 @@ def delete(self, url: str) -> None:
self.aws.client.delete_object(**kwargs)
logger.info(
f"[AmazonCloudStorage.delete] File deleted successfully | "
f"{{'project_id': '{self.project_id}', 'bucket': '{mask_string(name.Bucket)}', 'key': '{mask_string(name.Key)}'}}"
f"{{'project_id': '{self.project_id}', 'bucket': '{_mask(name.Bucket)}', 'key': '{_mask(name.Key)}'}}"
)
except ClientError as err:
logger.error(
f"[AmazonCloudStorage.delete] AWS delete error | "
f"{{'project_id': '{self.project_id}', 'bucket': '{mask_string(name.Bucket)}', 'key': '{mask_string(name.Key)}', 'error': '{str(err)}'}}",
f"{{'project_id': '{self.project_id}', 'bucket': '{_mask(name.Bucket)}', 'key': '{_mask(name.Key)}', 'error': '{str(err)}'}}",
exc_info=True,
)
raise CloudStorageError(f'AWS Error: "{err}" ({url})') from err
Expand All @@ -292,6 +303,11 @@ def get_cloud_storage(session: Session, project_id: int) -> CloudStorage:
"""
Method to create and configure a cloud storage instance.
"""
# Lazy import to avoid a top-level cycle: storage.py is imported from
# app.services.llm.providers.gai_vertex, which itself is wired into the
# provider registry that app.crud transitively pulls in.
from app.crud import get_project_by_id

project = get_project_by_id(session=session, project_id=project_id)
if not project:
raise ValueError(f"Invalid project_id: {project_id}")
Expand All @@ -306,3 +322,103 @@ def get_cloud_storage(session: Session, project_id: int) -> CloudStorage:
exc_info=True,
)
raise


GCS_SCOPES = ("https://www.googleapis.com/auth/cloud-platform",)

MAX_AUDIO_UPLOAD_BYTES = 50 * 1024 * 1024 # 50 MB

_MIME_TO_EXT = {
"audio/wav": ".wav",
"audio/mpeg": ".mp3",
"audio/mp3": ".mp3",
"audio/ogg": ".ogg",
"audio/flac": ".flac",
"audio/webm": ".webm",
"audio/aac": ".aac",
"audio/aiff": ".aiff",
}


def upload_audio_to_gcs(
*,
bucket_name: str,
sa_info: dict,
audio_bytes: bytes | None = None,
local_path: str | None = None,
content_type: str | None = None,
project_id: str | None = None,
key_prefix: str = "audio",
) -> str:
"""Upload audio to GCS and return its ``gs://bucket/key`` URI.

Pass exactly one of ``audio_bytes`` or ``local_path``.

BYOK: caller supplies ``sa_info`` and ``bucket_name``. The returned URI
plugs directly into Vertex ``fileData.fileUri``.
"""
if (audio_bytes is None) == (local_path is None):
raise ValueError("Pass exactly one of audio_bytes or local_path")

if local_path is not None:
if not os.path.isfile(local_path):
raise FileNotFoundError(f"Audio file not found: {local_path}")
size = os.path.getsize(local_path)
ext = Path(local_path).suffix or ""
mime = content_type or mimetypes.guess_type(local_path)[0] or "audio/wav"
else:
if not audio_bytes:
raise ValueError("audio_bytes is empty")
size = len(audio_bytes)
mime = content_type or "audio/wav"
ext = _MIME_TO_EXT.get(mime, "")

if mime not in _MIME_TO_EXT:
raise ValueError(
f"Unsupported content_type '{mime}'. Allowed: "
f"{', '.join(sorted(_MIME_TO_EXT))}"
)

# Sniff the actual bytes — content_type is caller-supplied and spoofable.
sniff_source = audio_bytes if audio_bytes is not None else local_path
detected = filetype.guess(sniff_source)
if detected is None or not detected.mime.startswith("audio/"):
raise ValueError(
f"Uploaded content is not a recognised audio file "
f"(detected={detected.mime if detected else 'unknown'})"
)

if size > MAX_AUDIO_UPLOAD_BYTES:
raise ValueError(
f"Audio exceeds {MAX_AUDIO_UPLOAD_BYTES // (1024 * 1024)} MB limit "
f"(got {size / (1024 * 1024):.1f} MB)"
)

key = f"{key_prefix}/{uuid4().hex}{ext}"

try:
creds = service_account.Credentials.from_service_account_info(
sa_info, scopes=list(GCS_SCOPES)
)
client = gcs.Client(
project=project_id or sa_info.get("project_id"), credentials=creds
)
blob = client.bucket(bucket_name).blob(key)
if local_path is not None:
blob.upload_from_filename(local_path, content_type=mime)
else:
blob.upload_from_string(audio_bytes, content_type=mime)
except Exception as e:
logger.error(
f"[upload_audio_to_gcs] Upload failed | "
f"bucket={bucket_name}, key={key}, error={e}",
exc_info=True,
)
raise CloudStorageError(f"GCS upload failed: {e}") from e

uri = f"gs://{bucket_name}/{key}"
logger.info(
f"[upload_audio_to_gcs] Uploaded | "
f"uri={uri}, mime={mime}, size_kb={size / 1024:.1f}"
Comment on lines +412 to +422
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

Mask the staged GCS identifiers in these logs.

Lines 387-397 log raw bucket, key, and uri values for uploaded audio, which leaks storage metadata for user media into both error and info logs. This file already introduced _mask() for the S3 paths, so the GCS helper should apply the same masking before logging. As per coding guidelines, "Prefix all log messages with the function name in square brackets: logger.info(f"[function_name] Message {mask_string(sensitive_value)}")".

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@backend/app/core/cloud/storage.py` around lines 387 - 397, The logs in
upload_audio_to_gcs currently emit raw bucket, key and uri values; update the
logger.error and logger.info calls to mask those identifiers using the existing
_mask() helper (e.g., use _mask(bucket_name), _mask(key)) and construct a masked
uri like f"gs://{_mask(bucket_name)}/{_mask(key)}" before logging, preserving
exc_info=True on errors and keeping the existing "[upload_audio_to_gcs]" prefix
in each message.

)
return uri
11 changes: 11 additions & 0 deletions backend/app/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,17 @@ def SQLALCHEMY_DATABASE_URI(self) -> PostgresDsn:
AWS_DEFAULT_REGION: str = ""
AWS_S3_BUCKET_PREFIX: str = ""

# GCP Vertex AI platform defaults. Used when a project does not register
# its own google-vertex credential row (BYOK is all-or-nothing — see the
# Provider.GOOGLE_VERTEX comment in app/core/providers.py).
GCP_VERTEX_API_KEY: str = ""
GCP_VERTEX_LOCATION: str = ""
GCP_PROJECT_ID: str = ""
# Filesystem path to the platform-default GCP service-account JSON.
# Used by the registry fallback when a project has no google-vertex row.
GCP_SA_KEY: str = ""
GCS_AUDIO_BUCKET: str = ""

# RabbitMQ configuration for Celery broker
RABBITMQ_HOST: str = "localhost"
RABBITMQ_PORT: int = 5672
Expand Down
Loading
Loading