moves vectorizer dtype to base class attribute

justin-cechmanek · tylerhutcherson · commit 52b1b60952ef · 2025-01-06T14:21:54.000-05:00
diff --git a/redisvl/utils/vectorize/base.py b/redisvl/utils/vectorize/base.py
@@ -5,6 +5,7 @@
 from pydantic.v1 import BaseModel, validator
 
 from redisvl.redis.utils import array_to_buffer
+from redisvl.schema.fields import VectorDataType
 
 
 class Vectorizers(Enum):
@@ -19,11 +20,22 @@ class Vectorizers(Enum):
 class BaseVectorizer(BaseModel, ABC):
     model: str
     dims: int
+    dtype: str
 
     @property
     def type(self) -> str:
         return "base"
 
+    @validator("dtype")
+    def check_dtype(cls, dtype):  # class-level hook; same (cls, value) form as check_dims
+        try:
+            VectorDataType(dtype.upper())  # lookup is attempted on the upper-cased name
+        except ValueError as err:
+            raise ValueError(
+                f"Invalid data type: {dtype}. Supported types are: {[t.lower() for t in VectorDataType]}"
+            ) from err  # chain explicitly so the enum's own error is the recorded cause
+        return dtype  # the caller's original spelling is returned unchanged
+
     @validator("dims")
     @classmethod
     def check_dims(cls, value):
diff --git a/redisvl/utils/vectorize/text/azureopenai.py b/redisvl/utils/vectorize/text/azureopenai.py
@@ -52,7 +52,10 @@ class AzureOpenAITextVectorizer(BaseVectorizer):
     _aclient: Any = PrivateAttr()
 
     def __init__(
-        self, model: str = "text-embedding-ada-002", api_config: Optional[Dict] = None
+        self,
+        model: str = "text-embedding-ada-002",
+        api_config: Optional[Dict] = None,
+        dtype: str = "float32",
     ):
         """Initialize the AzureOpenAI vectorizer.
 
@@ -63,13 +66,17 @@ def __init__(
             api_config (Optional[Dict], optional): Dictionary containing the
                 API key, API version, Azure endpoint, and any other API options.
                 Defaults to None.
+            dtype (str): the default datatype to use when embedding text as byte arrays.
+                Used when setting `as_buffer=True` in calls to embed(), embed_many(),
+                aembed() and aembed_many(). Defaults to 'float32'.
 
         Raises:
             ImportError: If the openai library is not installed.
             ValueError: If the AzureOpenAI API key, version, or endpoint are not provided.
+            ValueError: If an invalid dtype is provided.
         """
         self._initialize_clients(api_config)
-        super().__init__(model=model, dims=self._set_model_dims(model))
+        super().__init__(model=model, dims=self._set_model_dims(model), dtype=dtype)
 
     def _initialize_clients(self, api_config: Optional[Dict]):
         """
@@ -190,7 +197,7 @@ def embed_many(
         if len(texts) > 0 and not isinstance(texts[0], str):
             raise TypeError("Must pass in a list of str values to embed.")
 
-        dtype = kwargs.pop("dtype", "float32")
+        dtype = kwargs.pop("dtype", self.dtype)
 
         embeddings: List = []
         for batch in self.batchify(texts, batch_size, preprocess):
@@ -234,7 +241,7 @@ def embed(
         if preprocess:
             text = preprocess(text)
 
-        dtype = kwargs.pop("dtype", "float32")
+        dtype = kwargs.pop("dtype", self.dtype)
 
         result = self._client.embeddings.create(input=[text], model=self.model)
         return self._process_embedding(result.data[0].embedding, as_buffer, dtype)
@@ -274,7 +281,7 @@ async def aembed_many(
         if len(texts) > 0 and not isinstance(texts[0], str):
             raise TypeError("Must pass in a list of str values to embed.")
 
-        dtype = kwargs.pop("dtype", "float32")
+        dtype = kwargs.pop("dtype", self.dtype)
 
         embeddings: List = []
         for batch in self.batchify(texts, batch_size, preprocess):
@@ -320,7 +327,7 @@ async def aembed(
         if preprocess:
             text = preprocess(text)
 
-        dtype = kwargs.pop("dtype", "float32")
+        dtype = kwargs.pop("dtype", self.dtype)
 
         result = await self._aclient.embeddings.create(input=[text], model=self.model)
         return self._process_embedding(result.data[0].embedding, as_buffer, dtype)
diff --git a/redisvl/utils/vectorize/text/bedrock.py b/redisvl/utils/vectorize/text/bedrock.py
@@ -49,6 +49,7 @@ def __init__(
         self,
         model: str = "amazon.titan-embed-text-v2:0",
         api_config: Optional[Dict[str, str]] = None,
+        dtype: str = "float32",
     ) -> None:
         """Initialize the AWS Bedrock Vectorizer.
 
@@ -57,10 +58,13 @@ def __init__(
             api_config (Optional[Dict[str, str]]): AWS credentials and config.
                 Can include: aws_access_key_id, aws_secret_access_key, aws_region
                 If not provided, will use environment variables.
+            dtype (str): the default datatype used when embedding text as byte arrays
+                (`as_buffer=True` in embed() and embed_many()). Defaults to 'float32'.
 
         Raises:
             ValueError: If credentials are not provided in config or environment.
             ImportError: If boto3 is not installed.
+            ValueError: If an invalid dtype is provided.
         """
         try:
             import boto3  # type: ignore
@@ -94,7 +98,7 @@ def __init__(
             region_name=aws_region,
         )
 
-        super().__init__(model=model, dims=self._set_model_dims(model))
+        super().__init__(model=model, dims=self._set_model_dims(model), dtype=dtype)
 
     def _set_model_dims(self, model: str) -> int:
         """Initialize model and determine embedding dimensions."""
@@ -145,7 +149,7 @@ def embed(
         response_body = json.loads(response["body"].read())
         embedding = response_body["embedding"]
 
-        dtype = kwargs.pop("dtype", "float32")
+        dtype = kwargs.pop("dtype", self.dtype)
         return self._process_embedding(embedding, as_buffer, dtype)
 
     @retry(
@@ -181,7 +185,7 @@ def embed_many(
             raise TypeError("Texts must be a list of strings")
 
         embeddings: List[List[float]] = []
-        dtype = kwargs.pop("dtype", "float32")
+        dtype = kwargs.pop("dtype", self.dtype)
 
         for batch in self.batchify(texts, batch_size, preprocess):
             # Process each text in the batch individually since Bedrock
diff --git a/redisvl/utils/vectorize/text/cohere.py b/redisvl/utils/vectorize/text/cohere.py
@@ -47,7 +47,10 @@ class CohereTextVectorizer(BaseVectorizer):
     _client: Any = PrivateAttr()
 
     def __init__(
-        self, model: str = "embed-english-v3.0", api_config: Optional[Dict] = None
+        self,
+        model: str = "embed-english-v3.0",
+        api_config: Optional[Dict] = None,
+        dtype: str = "float32",
     ):
         """Initialize the Cohere vectorizer.
 
@@ -57,14 +60,17 @@ def __init__(
             model (str): Model to use for embedding. Defaults to 'embed-english-v3.0'.
             api_config (Optional[Dict], optional): Dictionary containing the API key.
                 Defaults to None.
+            dtype (str): the default datatype to use when embedding text as byte arrays.
+                Used when setting `as_buffer=True` in calls to embed() and embed_many().
+                Defaults to 'float32'.
 
         Raises:
             ImportError: If the cohere library is not installed.
             ValueError: If the API key is not provided.
-
+            ValueError: If an invalid dtype is provided.
         """
         self._initialize_client(api_config)
-        super().__init__(model=model, dims=self._set_model_dims(model))
+        super().__init__(model=model, dims=self._set_model_dims(model), dtype=dtype)
 
     def _initialize_client(self, api_config: Optional[Dict]):
         """
@@ -159,7 +165,7 @@ def embed(
         if preprocess:
             text = preprocess(text)
 
-        dtype = kwargs.pop("dtype", "float32")
+        dtype = kwargs.pop("dtype", self.dtype)
 
         embedding = self._client.embed(
             texts=[text], model=self.model, input_type=input_type
@@ -228,7 +234,7 @@ def embed_many(
                     See https://docs.cohere.com/reference/embed."
             )
 
-        dtype = kwargs.pop("dtype", "float32")
+        dtype = kwargs.pop("dtype", self.dtype)
 
         embeddings: List = []
         for batch in self.batchify(texts, batch_size, preprocess):
diff --git a/redisvl/utils/vectorize/text/custom.py b/redisvl/utils/vectorize/text/custom.py
@@ -7,7 +7,7 @@
 
 
 class CustomTextVectorizer(BaseVectorizer):
-    """The CustomTextVectorizer class wraps user-defined embeding methods to create
+    """The CustomTextVectorizer class wraps user-defined embedding methods to create
     embeddings for text data.
 
     This vectorizer is designed to accept a provided callable text vectorizer and
@@ -44,6 +44,7 @@ def __init__(
         embed_many: Optional[Callable] = None,
         aembed: Optional[Callable] = None,
         aembed_many: Optional[Callable] = None,
+        dtype: str = "float32",
     ):
         """Initialize the Custom vectorizer.
 
@@ -52,10 +53,14 @@ def __init__(
             embed_many (Optional[Callable)]: a Callable function that accepts a list of string objects and returns a list containing lists of floats. Defaults to None.
             aembed (Optional[Callable]): an asyncronous Callable function that accepts a string object and returns a lists of floats. Defaults to None.
             aembed_many (Optional[Callable]):  an asyncronous Callable function that accepts a list of string objects and returns a list containing lists of floats. Defaults to None.
+            dtype (str): the default datatype to use when embedding text as byte arrays.
+                Used when setting `as_buffer=True` in calls to embed(), embed_many(),
+                aembed() and aembed_many(). Defaults to 'float32'.
 
         Raises:
-            ValueError if any of the provided functions accept or return incorrect types.
-            TypeError if any of the provided functions are not Callable objects.
+            ValueError: if any of the provided functions accept or return incorrect types.
+            TypeError: if any of the provided functions are not Callable objects.
+            ValueError: If an invalid dtype is provided.
         """
 
         self._validate_embed(embed)
@@ -71,7 +76,7 @@ def __init__(
             self._validate_aembed_many(aembed_many)
             self._aembed_many_func = aembed_many
 
-        super().__init__(model=self.type, dims=self._set_model_dims())
+        super().__init__(model=self.type, dims=self._set_model_dims(), dtype=dtype)
 
     def _validate_embed(self, func: Callable):
         """calls the func with dummy input and validates that it returns a vector"""
@@ -173,7 +178,7 @@ def embed(
         if preprocess:
             text = preprocess(text)
 
-        dtype = kwargs.pop("dtype", "float32")
+        dtype = kwargs.pop("dtype", self.dtype)
 
         result = self._embed_func(text, **kwargs)
         return self._process_embedding(result, as_buffer, dtype)
@@ -212,7 +217,7 @@ def embed_many(
         if not self._embed_many_func:
             raise NotImplementedError
 
-        dtype = kwargs.pop("dtype", "float32")
+        dtype = kwargs.pop("dtype", self.dtype)
 
         embeddings: List = []
         for batch in self.batchify(texts, batch_size, preprocess):
@@ -254,7 +259,7 @@ async def aembed(
         if preprocess:
             text = preprocess(text)
 
-        dtype = kwargs.pop("dtype", "float32")
+        dtype = kwargs.pop("dtype", self.dtype)
 
         result = await self._aembed_func(text, **kwargs)
         return self._process_embedding(result, as_buffer, dtype)
@@ -293,7 +298,7 @@ async def aembed_many(
         if not self._aembed_many_func:
             raise NotImplementedError
 
-        dtype = kwargs.pop("dtype", "float32")
+        dtype = kwargs.pop("dtype", self.dtype)
 
         embeddings: List = []
         for batch in self.batchify(texts, batch_size, preprocess):
diff --git a/redisvl/utils/vectorize/text/huggingface.py b/redisvl/utils/vectorize/text/huggingface.py
@@ -33,21 +33,28 @@ class HFTextVectorizer(BaseVectorizer):
     _client: Any = PrivateAttr()
 
     def __init__(
-        self, model: str = "sentence-transformers/all-mpnet-base-v2", **kwargs
+        self,
+        model: str = "sentence-transformers/all-mpnet-base-v2",
+        dtype: str = "float32",
+        **kwargs,
     ):
         """Initialize the Hugging Face text vectorizer.
 
         Args:
             model (str): The pre-trained model from Hugging Face's Sentence
                 Transformers to be used for embedding. Defaults to
                 'sentence-transformers/all-mpnet-base-v2'.
+            dtype (str): the default datatype to use when embedding text as byte arrays.
+                Used when setting `as_buffer=True` in calls to embed() and embed_many().
+                Defaults to 'float32'.
 
         Raises:
             ImportError: If the sentence-transformers library is not installed.
             ValueError: If there is an error setting the embedding model dimensions.
+            ValueError: If an invalid dtype is provided.
         """
         self._initialize_client(model)
-        super().__init__(model=model, dims=self._set_model_dims())
+        super().__init__(model=model, dims=self._set_model_dims(), dtype=dtype)
 
     def _initialize_client(self, model: str):
         """Setup the HuggingFace client"""
@@ -100,7 +107,7 @@ def embed(
         if preprocess:
             text = preprocess(text)
 
-        dtype = kwargs.pop("dtype", "float32")
+        dtype = kwargs.pop("dtype", self.dtype)
 
         embedding = self._client.encode([text], **kwargs)[0]
         return self._process_embedding(embedding.tolist(), as_buffer, dtype)
@@ -136,7 +143,7 @@ def embed_many(
         if len(texts) > 0 and not isinstance(texts[0], str):
             raise TypeError("Must pass in a list of str values to embed.")
 
-        dtype = kwargs.pop("dtype", "float32")
+        dtype = kwargs.pop("dtype", self.dtype)
 
         embeddings: List = []
         for batch in self.batchify(texts, batch_size, preprocess):
diff --git a/redisvl/utils/vectorize/text/mistral.py b/redisvl/utils/vectorize/text/mistral.py
@@ -46,21 +46,30 @@ class MistralAITextVectorizer(BaseVectorizer):
     _client: Any = PrivateAttr()
     _aclient: Any = PrivateAttr()
 
-    def __init__(self, model: str = "mistral-embed", api_config: Optional[Dict] = None):
+    def __init__(
+        self,
+        model: str = "mistral-embed",
+        api_config: Optional[Dict] = None,
+        dtype: str = "float32",
+    ):
         """Initialize the MistralAI vectorizer.
 
         Args:
             model (str): Model to use for embedding. Defaults to
                 'text-embedding-ada-002'.
             api_config (Optional[Dict], optional): Dictionary containing the
                 API key. Defaults to None.
+            dtype (str): the default datatype to use when embedding text as byte arrays.
+                Used when setting `as_buffer=True` in calls to embed(), embed_many(),
+                aembed() and aembed_many(). Defaults to 'float32'.
 
         Raises:
             ImportError: If the mistralai library is not installed.
             ValueError: If the Mistral API key is not provided.
+            ValueError: If an invalid dtype is provided.
         """
         self._initialize_clients(api_config)
-        super().__init__(model=model, dims=self._set_model_dims(model))
+        super().__init__(model=model, dims=self._set_model_dims(model), dtype=dtype)
 
     def _initialize_clients(self, api_config: Optional[Dict]):
         """
@@ -140,7 +149,7 @@ def embed_many(
         if len(texts) > 0 and not isinstance(texts[0], str):
             raise TypeError("Must pass in a list of str values to embed.")
 
-        dtype = kwargs.pop("dtype", "float32")
+        dtype = kwargs.pop("dtype", self.dtype)
 
         embeddings: List = []
         for batch in self.batchify(texts, batch_size, preprocess):
@@ -184,7 +193,7 @@ def embed(
         if preprocess:
             text = preprocess(text)
 
-        dtype = kwargs.pop("dtype", "float32")
+        dtype = kwargs.pop("dtype", self.dtype)
 
         result = self._client.embeddings(model=self.model, input=[text])
         return self._process_embedding(result.data[0].embedding, as_buffer, dtype)
@@ -224,7 +233,7 @@ async def aembed_many(
         if len(texts) > 0 and not isinstance(texts[0], str):
             raise TypeError("Must pass in a list of str values to embed.")
 
-        dtype = kwargs.pop("dtype", "float32")
+        dtype = kwargs.pop("dtype", self.dtype)
 
         embeddings: List = []
         for batch in self.batchify(texts, batch_size, preprocess):
@@ -268,7 +277,7 @@ async def aembed(
         if preprocess:
             text = preprocess(text)
 
-        dtype = kwargs.pop("dtype", "float32")
+        dtype = kwargs.pop("dtype", self.dtype)
 
         result = await self._aclient.embeddings(model=self.model, input=[text])
         return self._process_embedding(result.data[0].embedding, as_buffer, dtype)
diff --git a/redisvl/utils/vectorize/text/openai.py b/redisvl/utils/vectorize/text/openai.py
diff --git a/redisvl/utils/vectorize/text/vertexai.py b/redisvl/utils/vectorize/text/vertexai.py
diff --git a/tests/integration/test_vectorizers.py b/tests/integration/test_vectorizers.py