Coargus · minkyu-choi07 · May 1, 2025 · May 1, 2025
diff --git a/cvias/image/detection/cvias_detection.py b/cvias/image/detection/cvias_detection.py
@@ -41,7 +41,7 @@ def calibrate_model_with_conformal_prediction(
     def calibrate_confidence(
         self,
         confidence: float,
-        calibration_func: callable[[float], float] | None = None,
+        calibration_func: callable | None = None,
     ) -> float:
         """Calibrate confidence score."""
         if self.calibration_method == "conformal_prediction":
@@ -55,5 +55,3 @@ def calibrate_confidence(
             )
 
         return calibration_func(confidence)
-
-        return 0.0
diff --git a/cvias/image/detection/mmdetection.py → cvias/image/detection/mm_detection.py b/cvias/image/detection/mmdetection.py → cvias/image/detection/mm_detection.py
diff --git a/cvias/image/detection/vllm_detection.py b/cvias/image/detection/vllm_detection.py
@@ -0,0 +1,258 @@
+"""VLLM Detection Model."""
+
+import base64
+import logging
+
+import cv2
+import numpy as np
+from cog_cv_abstraction.schema.detected_object import DetectedObject
+from neus_v.calibration.vlm import smooth_mapping
+from openai import OpenAI
+
+from cvias.image.detection import CviasDetectionModel
+
+
+class VLLMDetection(CviasDetectionModel):
+    """VLLM Detection Model."""
+
+    def __init__(
+        self,
+        api_key: str = "EMPTY",
+        api_base: str = "http://localhost:8000/v1",
+        model: str = "OpenGVLab/InternVL2_5-8B",
+        calibration_method: str | None = None,
+    ) -> None:
+        """Initialize VLLM Detection Model.
+
+        Creates an OpenAI-compatible client pointed at ``api_base`` and stores
+        the system prompt that restricts the model to a bare Yes/No answer.
+
+        Args:
+            api_key (str): The API key passed to the OpenAI client.
+            api_base (str): The base URL passed to the OpenAI client.
+            model (str): The model name sent with every completion request.
+            calibration_method (str | None): The calibration method, forwarded
+                to the parent class. A falsy value disables calibration in
+                ``detect``.
+        """
+        super().__init__(calibration_method=calibration_method)
+        self.client = OpenAI(api_key=api_key, base_url=api_base)
+        self.model = model
+        # The prompt is sent verbatim to the model, so typos here change the
+        # actual request text.
+        self.system_message = (
+            "You must only return a Yes or No, and not both, to any question asked.\n"  # noqa: E501
+            "You must not include any other symbols, information, text, justification in your answer or repeat Yes or No multiple times.\n"  # noqa: E501
+            "For example, if the question is 'Is there a cat present in the Image?', the answer must only be 'Yes' or 'No'."  # noqa: E501
+        )
+
+    def _encode_frame(self, frame: np.ndarray) -> str:
+        """Return ``frame`` as a base64 string of its JPEG encoding.
+
+        Args:
+            frame (np.ndarray): The frame image to encode.
+
+        Returns:
+            str: The base64 encoded (UTF-8 decoded) JPEG bytes of the frame.
+
+        Raises:
+            ValueError: If OpenCV reports that the frame could not be encoded.
+        """
+        ok, jpeg_buffer = cv2.imencode(".jpg", frame)
+        if ok:
+            return base64.b64encode(jpeg_buffer).decode("utf-8")
+
+        # Encoding failed: log and raise with the same message.
+        msg = "Could not encode frame"
+        logging.error(msg)
+        raise ValueError(msg)
+
+    def detect(
+        self,
+        frame_img: np.ndarray | None = None,
+        classes: list[np.ndarray] | None = None,
+        threshold: float | None = None,
+    ) -> DetectedObject:
+        """Ask the model whether the frame depicts the first entry of ``classes``.
+
+        The request is limited to a single output token. The confidence is the
+        probability of "Yes" normalized against "Yes" + "No", taken from the
+        top log-probabilities of that token.
+
+        Args:
+            frame_img (np.ndarray | None): The frame image to detect. Required
+                despite the ``None`` default.
+            classes (list[np.ndarray] | None): The classes to detect. Only the
+                first entry is used; it is interpolated into the text prompt
+                as the scene description. Required despite the ``None``
+                default.
+            threshold (float | None): If truthy, the confidence is passed
+                through ``smooth_mapping`` with this value as
+                ``false_threshold`` and a mapped confidence below it is
+                reported as not detected.
+
+        Returns:
+            DetectedObject: The detected object, after ``calibrate`` or
+            ``no_calibration`` has filled in its probabilities.
+
+        Raises:
+            ValueError: If ``frame_img`` or ``classes`` is missing, if the
+                frame cannot be encoded, or if the response carries no
+                probability for either "Yes" or "No".
+        """  # noqa: E501
+        if frame_img is None or classes is None or len(classes) == 0:
+            msg = "Both 'frame_img' and a non-empty 'classes' are required."
+            logging.error(msg)
+            raise ValueError(msg)
+
+        seq_of_frames = [frame_img]
+        scene_description = classes[0]
+
+        # Encode each frame.
+        encoded_images = [self._encode_frame(frame) for frame in seq_of_frames]
+
+        # Build the user message: a text prompt plus one image for each frame.
+        user_content = [
+            {
+                "type": "text",
+                "text": f"Does the sequence of these images depict '{scene_description}'",  # noqa: E501
+            }
+        ]
+        user_content.extend(
+            {
+                "type": "image_url",
+                "image_url": {"url": f"data:image/jpeg;base64,{encoded}"},
+            }
+            for encoded in encoded_images
+        )
+
+        # Create a chat completion request.
+        chat_response = self.client.chat.completions.create(
+            model=self.model,
+            messages=[
+                {"role": "system", "content": self.system_message},
+                {"role": "user", "content": user_content},
+            ],
+            max_tokens=1,
+            temperature=0.0,
+            logprobs=True,
+            top_logprobs=20,
+        )
+
+        # Retrieve the list of TopLogprob objects.
+        top_logprobs_list = (
+            chat_response.choices[0].logprobs.content[0].top_logprobs
+        )
+
+        # Build a mapping from token text (stripped) to its probability.
+        # Several candidates can collapse to the same stripped text (e.g.
+        # "Yes" and " Yes"), so their probabilities are summed instead of
+        # letting a later candidate overwrite an earlier one.
+        token_prob_map = {}
+        for top_logprob in top_logprobs_list:
+            token_text = top_logprob.token.strip()
+            token_prob_map[token_text] = token_prob_map.get(
+                token_text, 0.0
+            ) + np.exp(top_logprob.logprob)
+
+        # Extract probabilities for "Yes" and "No"
+        yes_prob = token_prob_map.get("Yes", 0.0)
+        no_prob = token_prob_map.get("No", 0.0)
+
+        # Compute the normalized probability for "Yes": p_yes / (p_yes + p_no)
+        if yes_prob + no_prob > 0:
+            confidence = yes_prob / (yes_prob + no_prob)
+        else:
+            msg = "No probabilities for 'Yes' or 'No' found in the response."
+            logging.error(msg)
+            raise ValueError(msg)
+
+        # NOTE(review): detection is decided on the raw Yes/No probabilities,
+        # not on the mapped confidence - confirm this is the intended rule.
+        is_detected = bool(yes_prob > no_prob)
+
+        if threshold:
+            confidence = smooth_mapping(
+                confidence=confidence, false_threshold=threshold
+            )
+            if confidence < threshold:
+                # A mapped confidence below the threshold always rejects.
+                is_detected = False
+
+        detected_object = DetectedObject(
+            name=scene_description,
+            model_name=self.model,
+            confidence_of_all_obj=[round(confidence, 3)],
+            probability_of_all_obj=[],
+            number_of_detection=1,
+            is_detected=is_detected,
+        )
+        if self.calibration_method:
+            # calibrate confidence score
+            if "internvl2" not in self.model.lower():
+                logging.warning(
+                    "Temperature scaling calibration is only supported for InternVL models."  # noqa: E501
+                )
+                return self.no_calibration(detected_object)
+            return self.calibrate(detected_object)
+
+        return self.no_calibration(detected_object)
+
+    def no_calibration(self, detected_object: DetectedObject) -> DetectedObject:
+        """Use the raw confidences as probabilities, without calibration.
+
+        Args:
+            detected_object (DetectedObject): Detected object.
+
+        Returns:
+            DetectedObject: The same object. When it is flagged as detected,
+            ``probability_of_all_obj`` holds a copy of its confidences and
+            ``probability`` their maximum; otherwise it is returned untouched.
+        """
+        if not detected_object.is_detected:
+            return detected_object
+
+        raw_scores = list(detected_object.confidence_of_all_obj)
+        detected_object.probability_of_all_obj = raw_scores
+        detected_object.probability = max(raw_scores)
+        return detected_object
+
+    def calibrate(self, detected_object: DetectedObject) -> DetectedObject:
+        """Fill in calibrated probabilities for a detected object.
+
+        Each confidence is passed through ``calibrate_confidence`` with
+        ``calibrate_function`` as the calibration function.
+
+        Args:
+            detected_object (DetectedObject): Detected object.
+
+        Returns:
+            DetectedObject: The same object. When it is flagged as detected,
+            ``probability_of_all_obj`` holds the calibrated values and
+            ``probability`` their maximum; otherwise it is returned untouched.
+        """
+        if not detected_object.is_detected:
+            return detected_object
+
+        calibrated_scores = [
+            self.calibrate_confidence(
+                confidence=score,
+                calibration_func=self.calibrate_function,
+            )
+            for score in detected_object.confidence_of_all_obj
+        ]
+        detected_object.probability_of_all_obj = calibrated_scores
+        detected_object.probability = max(calibrated_scores)
+        return detected_object
+
+    def calibrate_function(
+        self,
+        confidence: float,
+        true_threshold: float = 0.95,
+        false_threshold: float = 0.40,
+        target_conf: float = 0.60,
+        target_prob: float = 0.78,
+        p_min: float = 0.01,
+        p_max: float = 0.99,
+        steepness_factor: float = 0.7,  # lower = less steep
+    ) -> float:
+        """Map a confidence to a probability with a rescaled sigmoid.
+
+        Confidences at or below ``false_threshold`` map to ``p_min`` and those
+        at or above ``true_threshold`` map to ``p_max``. In between, a sigmoid
+        over the normalized confidence is rescaled so that its values at the
+        two thresholds are exactly ``p_min`` and ``p_max``.
+
+        Args:
+            confidence: Input confidence score
+            true_threshold: Upper threshold (default 0.95)
+            false_threshold: Lower threshold (default 0.40)
+            target_conf: Target confidence point (default 0.60)
+            target_prob: Target probability value (default 0.78)
+            p_min: Minimum probability (default 0.01)
+            p_max: Maximum probability (default 0.99)
+            steepness_factor: Multiplier applied to the sigmoid steepness
+                (default 0.7, lower = less steep)
+
+        Returns:
+            The mapped probability, between ``p_min`` and ``p_max`` for the
+            default parameters.
+        """
+        # Outside the open interval between the thresholds the output saturates.
+        if confidence <= false_threshold:
+            return p_min
+        if confidence >= true_threshold:
+            return p_max
+
+        conf_span = true_threshold - false_threshold
+        prob_span = p_max - p_min
+
+        # Target point expressed in normalized (0-1) coordinates.
+        x_norm = (target_conf - false_threshold) / conf_span
+        y_norm = (target_prob - p_min) / prob_span
+
+        # For a sigmoid f(x) = 1 / (1 + e^(-k(x - x0))) passing through the
+        # target point: ln(1/y - 1) = -k(x - x0).
+        log_term = np.log(1 / y_norm - 1)
+
+        # Hard-coded initial midpoint in normalized coordinates; it only
+        # serves to derive the base steepness.
+        x0 = 0.30
+        base_k = -log_term / (x_norm - x0)
+
+        # Apply the steepness factor, then re-solve the midpoint so the raw
+        # sigmoid still passes through the target: x0 = x + ln(1/y - 1) / k.
+        k = base_k * steepness_factor
+        adjusted_x0 = x_norm + log_term / k
+
+        def _sigmoid(x: float) -> float:
+            """Evaluate the fitted sigmoid at normalized position ``x``."""
+            return 1 / (1 + np.exp(-k * (x - adjusted_x0)))
+
+        x_scaled = (confidence - false_threshold) / conf_span
+        sigmoid_value = _sigmoid(x_scaled)
+
+        # Rescale so the curve hits exactly p_min and p_max at the thresholds
+        # (normalized positions 0 and 1).
+        min_val = _sigmoid(0)
+        max_val = _sigmoid(1)
+        normalized = (sigmoid_value - min_val) / (max_val - min_val)
+
+        return p_min + normalized * prob_span
diff --git a/cvias/vision_language/internvl/internvl.py b/cvias/vision_language/internvl/internvl.py
@@ -238,7 +238,7 @@ def infer_with_image_confidence(
             self.tokenizer, pixel_values, language, generation_config
         )
 
-    def chat_with_confidence(  # noqa: PLR0913
+    def chat_with_confidence(
         self,
         tokenizer: AutoTokenizer,
         pixel_values: torch.Tensor,

diff --git a/pyproject.toml b/pyproject.toml
@@ -9,7 +9,7 @@ namespaces = true  # to disable scanning PEP 420 namespaces (true by default)
 
 [project]
 name = "cvias"
-version = "0.1.3-dev"
+version = "0.1.4-dev"
 authors = [
   { name = "Coargus Engineering", email = "engineering@coargus.com" },
 ]
@@ -22,6 +22,12 @@ classifiers = [
     "Operating System :: OS Independent",
 ]
 dependencies = [
+    "torch==2.4.0",
+    "torchvision==0.19.0",
+    "torchaudio==2.4.0",
+    "opencv-python==4.10.0.84",
+    "transformers==4.47.0",
+    "vllm==0.6.3.post1",
     "cog_cv_abstraction@git+https://github.com/Coargus/computer-vision-model-abstraction.git@v0.0.5-dev",
     "cogutil@git+https://github.com/Coargus/cogutil.git@v0.0.1",
     "cogcvutil@git+https://github.com/Coargus/cogcvutil.git",

diff --git a/ruff.toml b/ruff.toml
@@ -82,7 +82,7 @@ select = [
     "RUF",
 ]
 
-ignore = ["ANN101","ANN102","COM","EXE","PD","S307","FBT001","FBT002","ISC001","G004"]
+ignore = ["ANN101","ANN102","COM","EXE","PD","S307","FBT001","FBT002","ISC001","G004","PLR0913"]
 
 
 # Allow fix for all enabled rules (when `--fix`) is provided.

diff --git a/scripts/start_vllm_server.sh b/scripts/start_vllm_server.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+# Start a vLLM OpenAI-compatible server for the model below.
+
+MODEL="OpenGVLab/InternVL2_5-8B"
+# Optional GPU/NCCL tuning; uncomment as needed for the host.
+# export CUDA_DEVICE_ORDER="PCI_BUS_ID"
+# export NCCL_P2P_DISABLE=1
+# export CUDA_VISIBLE_DEVICES="0"
+# export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
+PORT=8000
+# Variables are quoted so the values are never word-split or glob-expanded.
+vllm serve "$MODEL" \
+    --port "$PORT" \
+    --trust-remote-code \
+    --limit-mm-per-prompt image=4 \
+    --max-model-len 8192 \
+    --gpu-memory-utilization 0.97 \
+    --disable-log-requests
-Original file line number
+Diff line change
@@ Expand Up / @@ -82,7 +82,7 @@ select = [ @@
         "RUF",
     ]
-    ignore = ["ANN101","ANN102","COM","EXE","PD","S307","FBT001","FBT002","ISC001","G004"]
+    ignore = ["ANN101","ANN102","COM","EXE","PD","S307","FBT001","FBT002","ISC001","G004","PLR0913"]
     # Allow fix for all enabled rules (when `--fix`) is provided.
@@ Expand Down @@