Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 63 additions & 0 deletions .github/workflows/python-app.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# CI workflow: lint, byte-compile and smoke-test the project on every push or
# pull request that targets the main/master branch.
name: ContraLegal CI/CD

on:
  push:
    branches: [ "main", "master" ]
  pull_request:
    branches: [ "main", "master" ]

# The job only reads the repository, so the token is limited to read access.
permissions:
  contents: read

jobs:
  build:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python 3.10
        # v5 runs on a supported Node runtime; v4 relies on the deprecated Node 16.
        uses: actions/setup-python@v5
        with:
          python-version: "3.10"
          cache: "pip"

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
          pip install flake8

      - name: Lint with flake8 (fail on real errors only)
        run: |
          # Critical errors (fail build)
          flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
          # Style issues (warnings only)
          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics

      - name: Check Python syntax
        run: python -m compileall .

      - name: Smoke Test - Core Imports
        run: |
          python - << 'EOF'
          import sys
          import traceback
          try:
              # Test core modules load correctly
              import src.inference.predictor
              from src.utils.pdf_annotator import highlight_contract_risks
              print("Core modules loaded successfully")
          except Exception as e:
              print(f"Import failed: {e}")
              # Show the full traceback so the failing import is easy to locate in the CI log.
              traceback.print_exc()
              sys.exit(1)
          EOF

      - name: Smoke Test - Model Presence (non-blocking)
        run: |
          # The model file is not committed, so its absence must not fail the build.
          if [ -f models/model.pkl ]; then
            echo "Model file found. Smoke test passed."
          else
            echo "Warning: Model file missing. Expected during CI (not committed)."
            exit 0
          fi
5 changes: 4 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ torch>=2.0.0
transformers>=4.36.0
accelerate>=0.25.0


# UI
streamlit>=1.30.0
plotly>=5.18.0
Expand All @@ -36,3 +35,7 @@ langchain-groq>=0.2.0
langchain-huggingface>=1.0.0
sentence-transformers>=3.0.0
faiss-cpu>=1.8.0


langchain-text-splitters
python-dotenv
19 changes: 8 additions & 11 deletions src/data_pipeline/clause_segment.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,16 @@
from typing import List
from src.utils.spacy_loader import load_spacy_model
from langchain.text_splitter import RecursiveCharacterTextSplitter

class ClauseSegmenter:
    """Split raw contract text into overlapping chunks.

    Chunking is delegated to ``RecursiveCharacterTextSplitter``; fragments
    that are too short to be meaningful are dropped.
    """

    def __init__(self):
        # Only the character splitter is needed: the spaCy sentence pipeline
        # it replaced is no longer loaded, which avoids an expensive model
        # load whose result was never used by the splitter path.
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=150,
            separators=["\n\n", "\n", ". ", " ", ""],
        )

    def segment(self, text: str) -> List[str]:
        """Return the non-trivial chunks of *text*.

        Args:
            text: The text to split. Empty or ``None`` yields an empty list.

        Returns:
            The whitespace-stripped chunks produced by the splitter, keeping
            only those longer than 5 characters after stripping.
        """
        if not text:
            return []

        # Strip each chunk once, then filter on the stripped length.
        stripped_chunks = (chunk.strip() for chunk in self.splitter.split_text(text))
        return [chunk for chunk in stripped_chunks if len(chunk) > 5]
35 changes: 35 additions & 0 deletions src/utils/pdf_annotator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import fitz # PyMuPDF

def highlight_contract_risks(input_path: str, output_path: str, flagged_clauses: list):
    """Highlight flagged clauses in a PDF and save an annotated copy.

    Args:
        input_path: Path of the PDF to open.
        output_path: Path the annotated PDF is written to.
        flagged_clauses: Items exposing a ``'text'`` entry (the clause text)
            and a ``'risk'`` entry (a label). Labels containing ``"High"``
            are highlighted in red, all others in yellow.

    Returns:
        ``output_path``, after the annotated document has been saved.
    """
    doc = fitz.open(input_path)
    try:
        # Extract every page's text once rather than once per clause.
        # Whitespace is collapsed so that a clause wrapping across lines in
        # the PDF still passes the cheap substring pre-filter below.
        page_texts = [" ".join(page.get_text().split()).lower() for page in doc]

        for item in flagged_clauses:
            clause_text = " ".join(item['text'].split())
            if not clause_text:
                continue  # an empty clause has nothing to search for

            risk_level = item['risk']
            # RGB components in the 0.0-1.0 range: red for High risk, yellow otherwise.
            color = (1, 0, 0) if "High" in risk_level else (1, 0.8, 0)

            # A short lowercase prefix drives the quick filter; a longer prefix
            # of the same normalized text is what actually gets searched.
            probe = clause_text[:50].lower()
            needle = clause_text[:100]

            for page, page_text in zip(doc, page_texts):
                if probe not in page_text:
                    continue

                # Skip rectangles reported more than once for the same search.
                seen = set()
                for inst in page.search_for(needle):
                    key = (inst.x0, inst.y0, inst.x1, inst.y1)
                    if key in seen:
                        continue
                    seen.add(key)

                    annot = page.add_highlight_annot(inst)
                    annot.set_colors(stroke=color)
                    annot.update()

        doc.save(output_path, garbage=4, deflate=True)
    finally:
        # Release the document handle even if searching, annotating or saving fails.
        doc.close()
    return output_path
Loading