diff --git a/cvias/image/detection/cvias_detection.py b/cvias/image/detection/cvias_detection.py
index 6e82139..4f0bda0 100644
--- a/cvias/image/detection/cvias_detection.py
+++ b/cvias/image/detection/cvias_detection.py
@@ -41,7 +41,7 @@ def calibrate_model_with_conformal_prediction(
     def calibrate_confidence(
         self,
         confidence: float,
-        calibration_func: callable[[float], float] | None = None,
+        calibration_func: callable | None = None,
     ) -> float:
         """Calibrate confidence score."""
         if self.calibration_method == "conformal_prediction":
@@ -55,5 +55,3 @@ def calibrate_confidence(
                 )
 
             return calibration_func(confidence)
-
-        return 0.0
diff --git a/cvias/image/detection/mmdetection.py b/cvias/image/detection/mm_detection.py
similarity index 100%
rename from cvias/image/detection/mmdetection.py
rename to cvias/image/detection/mm_detection.py
diff --git a/cvias/image/detection/vllm_detection.py b/cvias/image/detection/vllm_detection.py
new file mode 100644
index 0000000..76ec6cb
--- /dev/null
+++ b/cvias/image/detection/vllm_detection.py
@@ -0,0 +1,313 @@
+"""VLLM Detection Model."""
+
+import base64
+import logging
+
+import cv2
+import numpy as np
+from cog_cv_abstraction.schema.detected_object import DetectedObject
+from neus_v.calibration.vlm import smooth_mapping
+from openai import OpenAI
+
+from cvias.image.detection import CviasDetectionModel
+
+logger = logging.getLogger(__name__)
+
+
+class VLLMDetection(CviasDetectionModel):
+    """VLLM Detection Model.
+
+    Asks a vision-language model behind an OpenAI-compatible API a Yes/No
+    question about a frame and derives a confidence score from the
+    log-probabilities of the first generated token.
+    """
+
+    def __init__(
+        self,
+        api_key: str = "EMPTY",
+        api_base: str = "http://localhost:8000/v1",
+        model: str = "OpenGVLab/InternVL2_5-8B",
+        calibration_method: str | None = None,
+    ) -> None:
+        """Initialize VLLM Detection Model.
+
+        Args:
+            api_key (str): The API key for the VLLM Detection Model.
+            api_base (str): The API base for the VLLM Detection Model.
+            model (str): The model for the VLLM Detection Model.
+            calibration_method (str | None): The calibration method for the VLLM Detection Model.
+        """  # noqa: E501
+        super().__init__(calibration_method=calibration_method)
+        self.client = OpenAI(api_key=api_key, base_url=api_base)
+        self.model = model
+        # The prompt demands a bare Yes/No so that the single token requested
+        # by detect() carries the whole answer.
+        # NOTE(review): "Imag e" below looks like a typo; it is left as-is in
+        # case the calibration constants were tuned against this exact prompt.
+        self.system_message = (
+            "You must only return a Yes or No, and not both, to any question asked.\n"  # noqa: E501
+            "You must not include any other symbols, information, text, justification in your answer or repeat Yes or No multiple times.\n"  # noqa: E501
+            "For example, if the question is 'Is there a cat present in the Imag e?', the answer must only be 'Yes' or 'No'."  # noqa: E501
+        )
+
+    def _encode_frame(self, frame: np.ndarray) -> str:
+        """Encode a uint8 numpy array (image) as a JPEG and then base64 encode it.
+
+        Args:
+            frame (np.ndarray): The frame image to encode.
+
+        Returns:
+            str: The base64 encoded frame image.
+
+        Raises:
+            ValueError: If the frame cannot be encoded.
+        """  # noqa: E501
+        ret, buffer = cv2.imencode(".jpg", frame)
+        if not ret:
+            msg = "Could not encode frame"
+            logger.error(msg)
+            raise ValueError(msg)
+        return base64.b64encode(buffer).decode("utf-8")
+
+    def detect(
+        self,
+        frame_img: np.ndarray | None = None,
+        classes: list[np.ndarray] | None = None,
+        threshold: float | None = None,
+    ) -> DetectedObject:
+        """Detect the scene description in the frame or sequence of frames.
+
+        Args:
+            frame_img (np.ndarray | None): The frame image to detect.
+            classes (list[np.ndarray] | None): The classes to detect. Only the
+                first entry is used, as the scene description in the prompt.
+            threshold (float | None): The threshold to use for the detection.
+                When truthy, the confidence is passed through
+                ``smooth_mapping`` and a result below the threshold is
+                reported as not detected.
+
+        Returns:
+            DetectedObject: The detected object.
+
+        Raises:
+            ValueError: If ``frame_img`` or ``classes`` is missing, or if the
+                response holds no usable "Yes"/"No" log-probabilities.
+        """
+        # Fail early with a clear message instead of an opaque error from
+        # OpenCV or from indexing ``None``.
+        if frame_img is None:
+            msg = "frame_img is required."
+            logger.error(msg)
+            raise ValueError(msg)
+        if classes is None or len(classes) == 0:
+            msg = "classes must contain a scene description."
+            logger.error(msg)
+            raise ValueError(msg)
+
+        seq_of_frames = [frame_img]
+        scene_description = classes[0]
+
+        # Encode each frame.
+        encoded_images = [self._encode_frame(frame) for frame in seq_of_frames]
+
+        # Build the user message: a text prompt plus one image for each frame.
+        user_content = [
+            {
+                "type": "text",
+                "text": f"Does the sequence of these images depict '{scene_description}'",  # noqa: E501
+            }
+        ]
+        user_content.extend(
+            {
+                "type": "image_url",
+                "image_url": {"url": f"data:image/jpeg;base64,{encoded}"},
+            }
+            for encoded in encoded_images
+        )
+
+        # Create a chat completion request. A single greedy token is enough
+        # because only the Yes/No log-probabilities are used.
+        chat_response = self.client.chat.completions.create(
+            model=self.model,
+            messages=[
+                {"role": "system", "content": self.system_message},
+                {"role": "user", "content": user_content},
+            ],
+            max_tokens=1,
+            temperature=0.0,
+            logprobs=True,
+            top_logprobs=20,
+        )
+
+        # Retrieve the list of TopLogprob objects.
+        logprobs = chat_response.choices[0].logprobs
+        if logprobs is None or not logprobs.content:
+            msg = "No log-probabilities found in the response."
+            logger.error(msg)
+            raise ValueError(msg)
+        top_logprobs_list = logprobs.content[0].top_logprobs
+
+        # Build a mapping from token text (stripped) to its probability.
+        # Tokens that differ only by surrounding whitespace (e.g. "Yes" and
+        # " Yes") share a key, so their probabilities are summed instead of
+        # letting the last one seen overwrite the others.
+        token_prob_map = {}
+        for top_logprob in top_logprobs_list:
+            token_text = top_logprob.token.strip()
+            token_prob_map[token_text] = token_prob_map.get(
+                token_text, 0.0
+            ) + np.exp(top_logprob.logprob)
+
+        # Extract probabilities for "Yes" and "No"
+        yes_prob = token_prob_map.get("Yes", 0.0)
+        no_prob = token_prob_map.get("No", 0.0)
+
+        # Compute the normalized probability for "Yes": p_yes / (p_yes + p_no)
+        if yes_prob + no_prob > 0:
+            confidence = yes_prob / (yes_prob + no_prob)
+        else:
+            msg = "No probabilities for 'Yes' or 'No' found in the response."
+            logger.error(msg)
+            raise ValueError(msg)
+
+        is_detected = yes_prob > no_prob
+        if threshold:
+            confidence = smooth_mapping(
+                confidence=confidence, false_threshold=threshold
+            )
+            # A mapped confidence below the threshold vetoes the detection.
+            if confidence < threshold:
+                is_detected = False
+
+        detected_object = DetectedObject(
+            name=scene_description,
+            model_name=self.model,
+            confidence_of_all_obj=[round(confidence, 3)],
+            probability_of_all_obj=[],
+            number_of_detection=1,
+            is_detected=is_detected,
+        )
+        if self.calibration_method:
+            # calibrate confidence score
+            if "internvl2" not in self.model.lower():
+                logger.warning(
+                    "Temperature scaling calibration is only supported for InternVL models."  # noqa: E501
+                )
+                return self.no_calibration(detected_object)
+            return self.calibrate(detected_object)
+
+        return self.no_calibration(detected_object)
+
+    def no_calibration(self, detected_object: DetectedObject) -> DetectedObject:
+        """Copy confidences to probabilities without calibrating them.
+
+        Args:
+            detected_object (DetectedObject): Detected object.
+
+        Returns:
+            DetectedObject: The same object. Its probabilities are the raw
+                confidences when detected; otherwise the list is empty and
+                the overall probability is 0.0.
+        """
+        probabilities = []
+        if detected_object.is_detected:
+            probabilities = list(detected_object.confidence_of_all_obj)
+        detected_object.probability_of_all_obj = probabilities
+        # ``default`` keeps a negative detection from crashing on max([]).
+        detected_object.probability = max(probabilities, default=0.0)
+        return detected_object
+
+    def calibrate(self, detected_object: DetectedObject) -> DetectedObject:
+        """Calibrate detection results.
+
+        Args:
+            detected_object (DetectedObject): Detected object.
+
+        Returns:
+            DetectedObject: Calibrated detected object. When nothing was
+                detected the probability list is empty and the overall
+                probability is 0.0.
+        """
+        probabilities = []
+        if detected_object.is_detected:
+            probabilities = [
+                self.calibrate_confidence(
+                    confidence=confidence,
+                    calibration_func=self.calibrate_function,
+                )
+                for confidence in detected_object.confidence_of_all_obj
+            ]
+        detected_object.probability_of_all_obj = probabilities
+        # ``default`` keeps a negative detection from crashing on max([]).
+        detected_object.probability = max(probabilities, default=0.0)
+        return detected_object
+
+    def calibrate_function(
+        self,
+        confidence: float,
+        true_threshold: float = 0.95,
+        false_threshold: float = 0.40,
+        target_conf: float = 0.60,
+        target_prob: float = 0.78,
+        p_min: float = 0.01,
+        p_max: float = 0.99,
+        steepness_factor: float = 0.7,
+    ) -> float:
+        """Map confidence to probability using a sigmoid function with adjustable steepness.
+
+        Args:
+            confidence: Input confidence score
+            true_threshold: Upper threshold; at or above it p_max is returned
+            false_threshold: Lower threshold; at or below it p_min is returned
+            target_conf: Confidence of the point the sigmoid is fitted through
+            target_prob: Probability of the point the sigmoid is fitted through
+            p_min: Minimum probability
+            p_max: Maximum probability
+            steepness_factor: Controls curve steepness (0-1, lower = less steep)
+
+        Returns:
+            float: The mapped probability, between p_min and p_max.
+        """  # noqa: E501
+        if confidence <= false_threshold:
+            return p_min
+
+        if confidence >= true_threshold:
+            return p_max
+
+        # Calculate parameters to ensure target_conf maps to target_prob
+        # For a sigmoid function: f(x) = L / (1 + e^(-k(x-x0)))
+
+        # First, normalize the target point
+        x_norm = (target_conf - false_threshold) / (
+            true_threshold - false_threshold
+        )
+        y_norm = (target_prob - p_min) / (p_max - p_min)
+
+        # Find x0 (midpoint) and k (steepness) to satisfy our target point
+        x0 = 0.30  # Provisional midpoint on the normalized confidence axis
+
+        # Calculate base k value to hit the target point
+        base_k = -np.log(1 / y_norm - 1) / (x_norm - x0)
+
+        # Apply steepness factor (lower = less steep)
+        k = base_k * steepness_factor
+
+        # With reduced steepness, we need to adjust x0 to still hit the target point  # noqa: E501
+        # Solve for new x0: y = 1/(1+e^(-k(x-x0))) => x0 = x + ln(1/y-1)/k
+        adjusted_x0 = x_norm + np.log(1 / y_norm - 1) / k
+
+        # Apply the sigmoid with our calculated parameters
+        x_scaled = (confidence - false_threshold) / (
+            true_threshold - false_threshold
+        )
+        sigmoid_value = 1 / (1 + np.exp(-k * (x_scaled - adjusted_x0)))
+
+        # Ensure we still hit exactly p_min and p_max at the thresholds
+        # by rescaling the output slightly
+        min_val = 1 / (1 + np.exp(-k * (0 - adjusted_x0)))
+        max_val = 1 / (1 + np.exp(-k * (1 - adjusted_x0)))
+
+        # Normalize the output
+        normalized = (sigmoid_value - min_val) / (max_val - min_val)
+
+        return p_min + normalized * (p_max - p_min)
diff --git a/cvias/vision_language/internvl/internvl.py b/cvias/vision_language/internvl/internvl.py
index 39e9f9d..67a6a87 100644
--- a/cvias/vision_language/internvl/internvl.py
+++ b/cvias/vision_language/internvl/internvl.py
@@ -238,7 +238,7 @@ def infer_with_image_confidence(
             self.tokenizer, pixel_values, language, generation_config
         )
 
-    def chat_with_confidence(  # noqa: PLR0913
+    def chat_with_confidence(
         self,
         tokenizer: AutoTokenizer,
         pixel_values: torch.Tensor,
diff --git a/pyproject.toml b/pyproject.toml
index 90e29aa..d9957b0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,7 +9,7 @@ namespaces = true # to disable scanning PEP 420 namespaces (true by default)
 
 [project]
 name = "cvias"
-version = "0.1.3-dev"
+version = "0.1.4-dev"
 authors = [
     { name = "Coargus Engineering", email = "engineering@coargus.com" },
 ]
@@ -22,6 +22,12 @@ classifiers = [
     "Operating System :: OS Independent",
 ]
 dependencies = [
+    "torch==2.4.0",
+    "torchvision==0.19.0",
+    "torchaudio==2.4.0",
+    "opencv-python==4.10.0.84",
+    "transformers==4.47.0",
+    "vllm==0.6.3.post1",
     "cog_cv_abstraction@git+https://github.com/Coargus/computer-vision-model-abstraction.git@v0.0.5-dev",
     "cogutil@git+https://github.com/Coargus/cogutil.git@v0.0.1",
     "cogcvutil@git+https://github.com/Coargus/cogcvutil.git",
diff --git a/ruff.toml b/ruff.toml
index 71f60f7..e9b01da 100644
--- a/ruff.toml
+++ b/ruff.toml
@@ -82,7 +82,7 @@ select = [
     "RUF",
 ]
 
-ignore = ["ANN101","ANN102","COM","EXE","PD","S307","FBT001","FBT002","ISC001","G004"]
+ignore = ["ANN101","ANN102","COM","EXE","PD","S307","FBT001","FBT002","ISC001","G004","PLR0913"]
 
 
 # Allow fix for all enabled rules (when `--fix`) is provided.
diff --git a/scripts/start_vllm_server.sh b/scripts/start_vllm_server.sh
new file mode 100644
index 0000000..00eebd5
--- /dev/null
+++ b/scripts/start_vllm_server.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+# Launch `vllm serve` for the InternVL model on a local port.
+
+set -euo pipefail
+
+MODEL="OpenGVLab/InternVL2_5-8B"
+# Optional GPU / allocator settings; uncomment as needed.
+# export CUDA_DEVICE_ORDER="PCI_BUS_ID"
+# export NCCL_P2P_DISABLE=1
+# export CUDA_VISIBLE_DEVICES="0"
+# export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
+PORT=8000
+
+vllm serve "$MODEL" \
+    --port "$PORT" \
+    --trust-remote-code \
+    --limit-mm-per-prompt image=4 \
+    --max-model-len 8192 \
+    --gpu-memory-utilization 0.97 \
+    --disable-log-requests