Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions cvias/image/detection/cvias_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def calibrate_model_with_conformal_prediction(
def calibrate_confidence(
self,
confidence: float,
calibration_func: callable[[float], float] | None = None,
calibration_func: callable | None = None,
) -> float:
"""Calibrate confidence score."""
if self.calibration_method == "conformal_prediction":
Expand All @@ -55,5 +55,3 @@ def calibrate_confidence(
)

return calibration_func(confidence)

return 0.0
258 changes: 258 additions & 0 deletions cvias/image/detection/vllm_detection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,258 @@
"""VLLM Detection Model."""

import base64
import logging

import cv2
import numpy as np
from cog_cv_abstraction.schema.detected_object import DetectedObject
from neus_v.calibration.vlm import smooth_mapping
from openai import OpenAI

from cvias.image.detection import CviasDetectionModel


class VLLMDetection(CviasDetectionModel):
"""VLLM Detection Model."""

def __init__(
    self,
    api_key: str = "EMPTY",
    api_base: str = "http://localhost:8000/v1",
    model: str = "OpenGVLab/InternVL2_5-8B",
    calibration_method: str | None = None,
) -> None:
    """Create the chat client and store the model name and system prompt.

    Args:
        api_key (str): API key handed to the OpenAI client.
        api_base (str): Base URL handed to the OpenAI client.
        model (str): Model identifier sent with every chat request.
        calibration_method (str | None): Forwarded unchanged to the parent
            class initializer.
    """
    super().__init__(calibration_method=calibration_method)
    self.model = model
    self.client = OpenAI(base_url=api_base, api_key=api_key)

    # The system prompt restricts the model to a bare "Yes" or "No" reply.
    # NOTE(review): "Imag e" in the last line looks like a typo for "Image";
    # it is kept byte-for-byte because the prompt is runtime text -- confirm
    # before changing it.
    prompt_parts = (
        "You must only return a Yes or No, and not both, to any question asked.\n",  # noqa: E501
        "You must not include any other symbols, information, text, justification in your answer or repeat Yes or No multiple times.\n",  # noqa: E501
        "For example, if the question is 'Is there a cat present in the Imag e?', the answer must only be 'Yes' or 'No'.",  # noqa: E501
    )
    self.system_message = "".join(prompt_parts)

def _encode_frame(self, frame: np.ndarray) -> str:
    """Return *frame* as a base64 string of its JPEG encoding.

    Args:
        frame (np.ndarray): The image to encode.

    Returns:
        str: UTF-8 text holding the base64 form of the JPEG bytes.

    Raises:
        ValueError: If ``cv2.imencode`` reports that encoding failed.
    """
    succeeded, jpeg_buffer = cv2.imencode(".jpg", frame)
    if succeeded:
        return base64.b64encode(jpeg_buffer).decode("utf-8")

    # Encoding failed: log the problem, then raise with the same message.
    msg = "Could not encode frame"
    logging.error(msg)
    raise ValueError(msg)

def detect(
    self,
    frame_img: np.ndarray | None = None,
    classes: list[np.ndarray] | None = None,
    threshold: float | None = None,
) -> DetectedObject:
    """Ask the model whether the frame depicts the given scene description.

    The frame is JPEG/base64 encoded and sent with a Yes/No question. Only
    the first generated token is requested; its top log-probabilities are
    turned into a confidence ``p_yes / (p_yes + p_no)``.

    Args:
        frame_img (np.ndarray | None): The frame image to query. Required
            despite the ``None`` default.
        classes (list[np.ndarray] | None): Only ``classes[0]`` is used; it is
            interpolated into the question as the scene description.
            NOTE(review): the annotation says ndarray but the value is
            formatted into a text prompt, so callers presumably pass
            strings -- confirm.
        threshold (float | None): When truthy, the confidence is passed
            through ``smooth_mapping`` with this value as
            ``false_threshold``, and a mapped confidence below it forces
            ``is_detected`` to False. A threshold of ``0.0`` is treated like
            ``None``.

    Returns:
        DetectedObject: The detection result, post-processed by
        ``calibrate`` or ``no_calibration``.

    Raises:
        ValueError: If ``frame_img`` or ``classes`` is missing/empty, if the
            response carries no token log-probabilities, or if neither
            "Yes" nor "No" appears among the top tokens.
    """
    # Fail early with a clear message instead of a TypeError/IndexError on
    # `classes[0]` or an OpenCV error on a missing frame.
    if frame_img is None:
        msg = "frame_img is required for detection."
        logging.error(msg)
        raise ValueError(msg)
    if classes is None or len(classes) == 0:
        msg = "classes must contain the scene description as its first item."
        logging.error(msg)
        raise ValueError(msg)

    scene_description = classes[0]
    seq_of_frames = [frame_img]

    # Build the user message: a text prompt plus one image per frame.
    user_content = [
        {
            "type": "text",
            "text": f"Does the sequence of these images depict '{scene_description}'",  # noqa: E501
        }
    ]
    for frame in seq_of_frames:
        encoded = self._encode_frame(frame)
        user_content.append(
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{encoded}"},
            }
        )

    # One token is enough: the system prompt restricts the answer to Yes/No
    # and only that token's log-probabilities are read.
    chat_response = self.client.chat.completions.create(
        model=self.model,
        messages=[
            {"role": "system", "content": self.system_message},
            {"role": "user", "content": user_content},
        ],
        max_tokens=1,
        temperature=0.0,
        logprobs=True,
        top_logprobs=20,
    )

    yes_prob, no_prob = self._yes_no_probabilities(chat_response)
    total_prob = yes_prob + no_prob
    if total_prob <= 0:
        msg = "No probabilities for 'Yes' or 'No' found in the response."
        logging.error(msg)
        raise ValueError(msg)

    # Normalized probability of "Yes": p_yes / (p_yes + p_no).
    confidence = yes_prob / total_prob
    is_detected = yes_prob > no_prob

    if threshold:
        confidence = smooth_mapping(
            confidence=confidence, false_threshold=threshold
        )
        # A mapped confidence under the threshold overrides the raw vote.
        if confidence < threshold:
            is_detected = False

    detected_object = DetectedObject(
        name=scene_description,
        model_name=self.model,
        confidence_of_all_obj=[round(float(confidence), 3)],
        probability_of_all_obj=[],
        number_of_detection=1,
        is_detected=bool(is_detected),
    )

    if self.calibration_method and "internvl2" in self.model.lower():
        return self.calibrate(detected_object)

    if self.calibration_method:
        # Calibration was requested but the model name does not match.
        logging.warning(
            "Temperature scaling calibration is only supported for InternVL models."  # noqa: E501
        )
    return self.no_calibration(detected_object)


def _yes_no_probabilities(self, chat_response) -> tuple[float, float]:  # noqa: ANN001
    """Sum the first-token probability mass assigned to "Yes" and to "No".

    Token texts are compared after ``strip()``, so variants such as " Yes"
    and "Yes" are added together rather than overwriting each other.

    Args:
        chat_response: Chat completion response whose first choice carries
            ``logprobs.content[0].top_logprobs``.

    Returns:
        tuple[float, float]: ``(yes_prob, no_prob)`` as plain Python floats.

    Raises:
        ValueError: If the response has no token log-probabilities.
    """
    try:
        top_logprobs = (
            chat_response.choices[0].logprobs.content[0].top_logprobs
        )
    except (AttributeError, IndexError, TypeError) as err:
        msg = "Chat response does not contain token log-probabilities."
        logging.error(msg)
        raise ValueError(msg) from err

    yes_prob = 0.0
    no_prob = 0.0
    for candidate in top_logprobs or []:
        token_text = candidate.token.strip()
        if token_text == "Yes":
            yes_prob += float(np.exp(candidate.logprob))
        elif token_text == "No":
            no_prob += float(np.exp(candidate.logprob))
    return yes_prob, no_prob

def no_calibration(self, detected_object: DetectedObject) -> DetectedObject:
    """Use the raw confidences as probabilities, without calibration.

    For a detected object with at least one confidence, the confidences are
    copied to ``probability_of_all_obj`` and ``probability`` is set to the
    largest one. Otherwise the object is returned untouched, which avoids
    calling ``max()`` on an empty list.

    Args:
        detected_object (DetectedObject): Detected object.

    Returns:
        DetectedObject: The same object, updated in place.
    """
    if not detected_object.is_detected:
        return detected_object

    probabilities = list(detected_object.confidence_of_all_obj)
    if probabilities:
        detected_object.probability_of_all_obj = probabilities
        detected_object.probability = max(probabilities)
    return detected_object

def calibrate(self, detected_object: DetectedObject) -> DetectedObject:
    """Calibrate the confidences of a detected object into probabilities.

    Each confidence is passed through ``self.calibrate_confidence`` with
    ``self.calibrate_function`` as the calibration function. The results are
    stored in ``probability_of_all_obj`` and the largest one in
    ``probability``. Objects that are not detected, or that carry no
    confidences, are returned untouched, which avoids calling ``max()`` on
    an empty list.

    Args:
        detected_object (DetectedObject): Detected object.

    Returns:
        DetectedObject: The same object, updated in place.
    """
    if not detected_object.is_detected:
        return detected_object

    probabilities = [
        self.calibrate_confidence(
            confidence=confidence,
            calibration_func=self.calibrate_function,
        )
        for confidence in detected_object.confidence_of_all_obj
    ]
    if probabilities:
        detected_object.probability_of_all_obj = probabilities
        detected_object.probability = max(probabilities)
    return detected_object

def calibrate_function(
    self,
    confidence: float,
    true_threshold: float = 0.95,
    false_threshold: float = 0.40,
    target_conf: float = 0.60,
    target_prob: float = 0.78,
    p_min: float = 0.01,
    p_max: float = 0.99,
    steepness_factor: float = 0.7,
) -> float:
    """Map a confidence to a probability with a rescaled sigmoid.

    Confidences at or below ``false_threshold`` map to ``p_min`` and those at
    or above ``true_threshold`` map to ``p_max``. In between, a sigmoid over
    the normalized confidence is stretched so that its values at the two
    thresholds land exactly on ``p_min`` and ``p_max``.

    The sigmoid is fitted to pass through ``(target_conf, target_prob)``, but
    the final stretch shifts the curve slightly, so the returned value at
    ``target_conf`` is close to, not exactly, ``target_prob``.

    Args:
        confidence: Input confidence score.
        true_threshold: Upper threshold (default 0.95).
        false_threshold: Lower threshold (default 0.40).
        target_conf: Confidence of the anchor point (default 0.60).
        target_prob: Probability of the anchor point (default 0.78).
        p_min: Probability returned at or below ``false_threshold``.
        p_max: Probability returned at or above ``true_threshold``.
        steepness_factor: Multiplier on the fitted steepness; lower values
            give a flatter curve (default 0.7).

    Returns:
        float: The calibrated probability as a plain Python float.

    Raises:
        ValueError: If the parameters leave the sigmoid undefined
            (``p_max == p_min``, ``target_prob`` not strictly between
            ``p_min`` and ``p_max``, a zero steepness, or a normalized
            ``target_conf`` equal to the reference midpoint 0.30).
    """
    if confidence <= false_threshold:
        return p_min

    if confidence >= true_threshold:
        return p_max

    # Past both guards, false_threshold < confidence < true_threshold, so
    # the span below is strictly positive.
    span = true_threshold - false_threshold

    if p_max == p_min:
        msg = "p_max and p_min must differ."
        raise ValueError(msg)

    # Anchor point expressed in normalized [0, 1] coordinates.
    x_norm = (target_conf - false_threshold) / span
    y_norm = (target_prob - p_min) / (p_max - p_min)
    if not 0.0 < y_norm < 1.0:
        msg = "target_prob must lie strictly between p_min and p_max."
        raise ValueError(msg)

    # Reference midpoint used to derive the base steepness. This is a fixed
    # tuning constant, not the centre of the normalized range.
    x0 = 0.30
    if np.isclose(x_norm, x0):
        msg = "Normalized target_conf must differ from the 0.30 midpoint."
        raise ValueError(msg)

    # For f(x) = 1 / (1 + e^(-k (x - x0))), solving f(x_norm) = y_norm
    # gives k = -ln(1 / y_norm - 1) / (x_norm - x0).
    logit_term = np.log(1 / y_norm - 1)
    k = (-logit_term / (x_norm - x0)) * steepness_factor
    if k == 0:
        msg = "Steepness is zero; check steepness_factor and target_prob."
        raise ValueError(msg)

    # With the scaled steepness, move the midpoint so the sigmoid still
    # passes through the anchor: x0' = x_norm + ln(1 / y_norm - 1) / k.
    adjusted_x0 = x_norm + logit_term / k

    def _sigmoid(x: float) -> float:
        return 1 / (1 + np.exp(-k * (x - adjusted_x0)))

    x_scaled = (confidence - false_threshold) / span

    # Stretch the output so x = 0 maps to p_min and x = 1 maps to p_max.
    min_val = _sigmoid(0)
    max_val = _sigmoid(1)
    normalized = (_sigmoid(x_scaled) - min_val) / (max_val - min_val)

    # Return a plain float, matching the annotation, not a NumPy scalar.
    return float(p_min + normalized * (p_max - p_min))
2 changes: 1 addition & 1 deletion cvias/vision_language/internvl/internvl.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,7 @@ def infer_with_image_confidence(
self.tokenizer, pixel_values, language, generation_config
)

def chat_with_confidence( # noqa: PLR0913
def chat_with_confidence(
self,
tokenizer: AutoTokenizer,
pixel_values: torch.Tensor,
Expand Down
8 changes: 7 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ namespaces = true # to disable scanning PEP 420 namespaces (true by default)

[project]
name = "cvias"
version = "0.1.3-dev"
version = "0.1.4-dev"
authors = [
{ name = "Coargus Engineering", email = "engineering@coargus.com" },
]
Expand All @@ -22,6 +22,12 @@ classifiers = [
"Operating System :: OS Independent",
]
dependencies = [
"torch==2.4.0",
"torchvision==0.19.0",
"torchaudio==2.4.0",
"opencv-python==4.10.0.84",
"transformers==4.47.0",
"vllm==0.6.3.post1",
"cog_cv_abstraction@git+https://github.com/Coargus/computer-vision-model-abstraction.git@v0.0.5-dev",
"cogutil@git+https://github.com/Coargus/cogutil.git@v0.0.1",
"cogcvutil@git+https://github.com/Coargus/cogcvutil.git",
Expand Down
2 changes: 1 addition & 1 deletion ruff.toml
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ select = [
"RUF",
]

ignore = ["ANN101","ANN102","COM","EXE","PD","S307","FBT001","FBT002","ISC001","G004"]
ignore = ["ANN101","ANN102","COM","EXE","PD","S307","FBT001","FBT002","ISC001","G004","PLR0913"]


# Allow fix for all enabled rules (when `--fix`) is provided.
Expand Down
15 changes: 15 additions & 0 deletions scripts/start_vllm_server.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/bin/bash
#
# Start a vLLM server for the detection model.
#
# Usage: start_vllm_server.sh [MODEL] [PORT]
#   MODEL  model identifier passed to `vllm serve`
#          (default: OpenGVLab/InternVL2_5-8B)
#   PORT   value passed to --port (default: 8000)

# Abort on errors, unset variables and failed pipeline stages.
set -euo pipefail

MODEL="${1:-OpenGVLab/InternVL2_5-8B}"
PORT="${2:-8000}"

# Optional GPU/NCCL settings; uncomment the ones the host needs.
# export CUDA_DEVICE_ORDER="PCI_BUS_ID"
# export NCCL_P2P_DISABLE=1
# export CUDA_VISIBLE_DEVICES="0"
# export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

# Variables are quoted so values with spaces or glob characters reach
# vllm as single arguments.
vllm serve "$MODEL" \
    --port "$PORT" \
    --trust-remote-code \
    --limit-mm-per-prompt image=4 \
    --max-model-len 8192 \
    --gpu-memory-utilization 0.97 \
    --disable-log-requests
Loading