AyushCoder9 · Ishiezz · Apr 18, 2026 · Apr 9, 2026 · Apr 11, 2026 · Apr 12, 2026
diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml
@@ -0,0 +1,63 @@
+name: ContraLegal CI/CD
+
+on:  # run on pushes and pull requests that target main or master
+  push:
+    branches: [ "main", "master" ]
+  pull_request:
+    branches: [ "main", "master" ]
+
+permissions:  # restrict the workflow token to read-only repository contents
+  contents: read
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v4
+
+    - name: Set up Python 3.10
+      uses: actions/setup-python@v4
+      with:
+        python-version: "3.10"  # quoted so YAML does not read it as the float 3.1
+        cache: "pip"  # enable setup-python's built-in pip cache
+
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+        pip install flake8
+
+    - name: Lint with flake8 (fail on real errors only)
+      run: |
+        # Critical errors (fail build)
+        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
+        # Style issues (warnings only)
+        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+
+    - name: Check Python syntax  # byte-compiles the Python files under the repository root
+      run: python -m compileall .
+
+    - name: Smoke Test - Core Imports  # exits 1 if either core module fails to import
+      run: |
+        python - << 'EOF'
+        import sys
+        try:
+            # Test core modules load correctly
+            import src.inference.predictor
+            from src.utils.pdf_annotator import highlight_contract_risks
+            print("Core modules loaded successfully")
+        except Exception as e:
+            print(f"Import failed: {e}")
+            sys.exit(1)
+        EOF
+
+    - name: Smoke Test - Model Presence (non-blocking)  # a missing model only prints a warning
+      run: |
+        if [ -f models/model.pkl ]; then
+          echo "Model file found. Smoke test passed."
+        else
+          echo "Warning: Model file missing. Expected during CI (not committed)."
+          exit 0
+        fi
diff --git a/requirements.txt b/requirements.txt
@@ -16,7 +16,6 @@ torch>=2.0.0
 transformers>=4.36.0
 accelerate>=0.25.0
 
-
 # UI
 streamlit>=1.30.0
 plotly>=5.18.0
@@ -36,3 +35,7 @@ langchain-groq>=0.2.0
 langchain-huggingface>=1.0.0
 sentence-transformers>=3.0.0
 faiss-cpu>=1.8.0
+
+
+langchain-text-splitters
+python-dotenv
diff --git a/src/data_pipeline/clause_segment.py b/src/data_pipeline/clause_segment.py
@@ -1,19 +1,21 @@
 from typing import List
-from src.utils.spacy_loader import load_spacy_model
+from langchain_text_splitters import RecursiveCharacterTextSplitter
 
 class ClauseSegmenter:
+    """Split text into overlapping chunks with a recursive character splitter."""
 
     def __init__(self):
-        self.nlp = load_spacy_model()
+        # Chunks of up to 1000 characters with a 150-character overlap; separators
+        # are tried in the listed order, from paragraph breaks down to characters.
+        self.splitter = RecursiveCharacterTextSplitter(
+            chunk_size=1000,
+            chunk_overlap=150,
+            separators=["\n\n", "\n", ". ", " ", ""]
+        )
 
     def segment(self, text: str) -> List[str]:
+        """Return the stripped chunks of *text* longer than 5 characters ([] if empty)."""
         if not text:
             return []
-
-        doc= self.nlp(text)
-        valid_clauses = [
-            sent.text.strip() for sent in doc.sents 
-            if len(sent.text.strip().split()) > 3
-        ]
-
-        return valid_clauses
+        stripped = (chunk.strip() for chunk in self.splitter.split_text(text))
+        return [chunk for chunk in stripped if len(chunk) > 5]
diff --git a/src/utils/pdf_annotator.py b/src/utils/pdf_annotator.py
@@ -0,0 +1,64 @@
+import fitz  # PyMuPDF
+
+
+def _normalize(text: str) -> str:
+    """Collapse every whitespace run in *text* to a single space."""
+    return " ".join(text.split())
+
+
+def highlight_contract_risks(input_path: str, output_path: str, flagged_clauses: list) -> str:
+    """Highlight flagged clauses in a PDF and save the annotated copy.
+
+    A clause is looked up on a page only when its first 50 characters occur
+    in that page's text (case-insensitive, whitespace-normalised); the first
+    100 characters are then passed to ``page.search_for``.
+
+    Args:
+        input_path: Path of the PDF to annotate.
+        output_path: Path the annotated PDF is written to.
+        flagged_clauses: Items providing ``item['text']`` (clause text) and
+            ``item['risk']`` (risk label).
+
+    Returns:
+        ``output_path``, after the annotated document has been saved.
+
+    Raises:
+        KeyError: If an item lacks the ``'text'`` or ``'risk'`` key.
+    """
+    # Resolve every clause once: (filter prefix, search needle, highlight colour).
+    targets = []
+    for item in flagged_clauses:
+        clause_text = _normalize(item['text'])
+        risk_level = item['risk']
+        if not clause_text:
+            # An empty prefix is a substring of every page, so skip it.
+            continue
+        # RGB in 0.0-1.0: red when the label contains "High", yellow otherwise.
+        color = (1, 0, 0) if "High" in risk_level else (1, 0.8, 0)
+        targets.append((clause_text[:50].lower(), clause_text[:100], color))
+
+    # The context manager closes the document even if annotating or saving fails.
+    with fitz.open(input_path) as doc:
+        for page in doc:
+            # Extract and normalise the page text once, not once per clause.
+            page_text = _normalize(page.get_text()).lower()
+
+            for prefix, needle, color in targets:
+                if prefix not in page_text:  # quick filter
+                    continue
+
+                seen = set()
+                for inst in page.search_for(needle):
+                    key = (inst.x0, inst.y0, inst.x1, inst.y1)
+                    if key in seen:
+                        continue
+                    seen.add(key)
+
+                    annot = page.add_highlight_annot(inst)
+                    annot.set_colors(stroke=color)
+                    annot.update()
+
+        # Save the annotated PDF while the document is still open.
+        doc.save(output_path, garbage=4, deflate=True)
+
+    return output_path