diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml
new file mode 100644
index 0000000..4f75eef
--- /dev/null
+++ b/.github/workflows/python-app.yml
@@ -0,0 +1,63 @@
+name: ContraLegal CI/CD
+
+on:
+  push:
+    branches: [ "main", "master" ]
+  pull_request:
+    branches: [ "main", "master" ]
+
+permissions:
+  contents: read
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v4
+
+    - name: Set up Python 3.10
+      uses: actions/setup-python@v5
+      with:
+        python-version: "3.10"
+        cache: "pip"
+
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+        pip install flake8
+
+    - name: Lint with flake8 (fail on real errors only)
+      run: |
+        # Critical errors (fail build)
+        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
+        # Style issues (warnings only)
+        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+
+    - name: Check Python syntax
+      run: python -m compileall .
+
+    - name: Smoke Test - Core Imports
+      run: |
+        python - << 'EOF'
+        import sys
+        try:
+            # Test core modules load correctly
+            import src.inference.predictor
+            from src.utils.pdf_annotator import highlight_contract_risks
+            print("Core modules loaded successfully")
+        except Exception as e:
+            print(f"Import failed: {e}")
+            sys.exit(1)
+        EOF
+
+    - name: Smoke Test - Model Presence (non-blocking)
+      run: |
+        if [ -f models/model.pkl ]; then
+          echo "Model file found. Smoke test passed."
+        else
+          echo "Warning: Model file missing. Expected during CI (not committed)."
+          exit 0
+        fi
diff --git a/requirements.txt b/requirements.txt
index bfcff78..6e5c742 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -16,7 +16,6 @@ torch>=2.0.0
 transformers>=4.36.0
 accelerate>=0.25.0
 
-
 # UI
 streamlit>=1.30.0
 plotly>=5.18.0
@@ -36,3 +35,6 @@ langchain-groq>=0.2.0
 langchain-huggingface>=1.0.0
 sentence-transformers>=3.0.0
 faiss-cpu>=1.8.0
+
+langchain-text-splitters>=0.3.0
+python-dotenv>=1.0.0
diff --git a/src/data_pipeline/clause_segment.py b/src/data_pipeline/clause_segment.py
index ce0f81f..a37f4b6 100644
--- a/src/data_pipeline/clause_segment.py
+++ b/src/data_pipeline/clause_segment.py
@@ -1,19 +1,20 @@
 from typing import List
-from src.utils.spacy_loader import load_spacy_model
+from langchain_text_splitters import RecursiveCharacterTextSplitter
 
 class ClauseSegmenter:
+    """Split raw contract text into overlapping chunks of at most 1000 characters."""
 
     def __init__(self):
-        self.nlp = load_spacy_model()
+        # Separators are tried in order: paragraph, line, sentence, word, then character.
+        self.splitter = RecursiveCharacterTextSplitter(
+            chunk_size=1000,
+            chunk_overlap=150,
+            separators=["\n\n", "\n", ". ", " ", ""]
+        )
 
     def segment(self, text: str) -> List[str]:
+        """Return the stripped chunks of ``text`` that are longer than five characters."""
         if not text:
             return []
-
-        doc= self.nlp(text)
-        valid_clauses = [
-            sent.text.strip() for sent in doc.sents
-            if len(sent.text.strip().split()) > 3
-        ]
-
-        return valid_clauses
\ No newline at end of file
+        chunks = self.splitter.split_text(text)
+        return [chunk.strip() for chunk in chunks if len(chunk.strip()) > 5]
diff --git a/src/utils/pdf_annotator.py b/src/utils/pdf_annotator.py
new file mode 100644
index 0000000..2da2685
--- /dev/null
+++ b/src/utils/pdf_annotator.py
@@ -0,0 +1,54 @@
+import fitz  # PyMuPDF
+
+# Highlight colours as RGB components in the 0.0-1.0 range.
+HIGH_RISK_COLOR = (1, 0, 0)  # red
+OTHER_RISK_COLOR = (1, 0.8, 0)  # yellow, used for every non-"High" label
+
+# Prefix lengths (in characters) used when locating a clause on a page.
+PREFILTER_CHARS = 50
+SEARCH_CHARS = 100
+
+
+def highlight_contract_risks(input_path: str, output_path: str, flagged_clauses: list) -> str:
+    """Highlight flagged clauses in a PDF and save the annotated copy.
+
+    Args:
+        input_path: Path of the PDF to annotate.
+        output_path: Path the annotated PDF is written to.
+        flagged_clauses: Dicts with a ``text`` key (the clause text) and a
+            ``risk`` key (a label; labels containing "High" are drawn in red).
+
+    Returns:
+        ``output_path``, unchanged.
+    """
+    # An empty needle passes the substring pre-filter on every page, so drop
+    # clauses without text before touching the document.
+    clauses = [item for item in flagged_clauses if item['text'].strip()]
+
+    # The context manager closes the document even if annotating or saving fails.
+    with fitz.open(input_path) as doc:
+        for page in doc:
+            # Extract and lowercase the page text once per page, not once per clause.
+            page_text = page.get_text().lower()
+
+            for item in clauses:
+                clause_text = item['text']
+                if clause_text[:PREFILTER_CHARS].lower() not in page_text:
+                    continue  # cheap substring check before the costlier search
+
+                color = HIGH_RISK_COLOR if "High" in item['risk'] else OTHER_RISK_COLOR
+
+                seen = set()
+                for inst in page.search_for(clause_text[:SEARCH_CHARS]):
+                    key = (inst.x0, inst.y0, inst.x1, inst.y1)
+                    if key in seen:
+                        continue  # rectangle already highlighted for this clause
+                    seen.add(key)
+
+                    annot = page.add_highlight_annot(inst)
+                    annot.set_colors(stroke=color)
+                    annot.update()
+
+        doc.save(output_path, garbage=4, deflate=True)
+
+    return output_path