diff --git a/examples/README.md b/examples/README.md new file mode 100644 index 000000000..863447565 --- /dev/null +++ b/examples/README.md @@ -0,0 +1,104 @@ +# NanoBEIR Evaluation Example + +This example demonstrates how to use the `vespa.nanobeir` module to easily configure and run NanoBEIR evaluations with different embedding models. + +## Overview + +The `vespa.nanobeir` module provides utilities to simplify the creation of Vespa applications for information retrieval evaluation. It handles the complexity of configuring different embedding models with varying dimensions, tokenizers, and binary vs. float embeddings. + +## Key Features + +- **Model-centric configuration**: All model-specific parameters (dimension, tokenizer, binarization) are encapsulated in a `ModelConfig` object +- **Automatic field type selection**: The embedding field type is automatically set to `tensor<float>` or `tensor<int8>` based on whether embeddings are binarized +- **Automatic indexing configuration**: For binarized embeddings, `pack_bits` is automatically added to the indexing statement +- **Distance metric selection**: Uses hamming distance for binarized embeddings and cosine similarity (angular distance) for float embeddings +- **Predefined models**: Includes configurations for common models like e5-small-v2, e5-base-v2, snowflake-arctic-embed, and bge-m3 + +## Usage + +### Basic Example + +```python +from vespa.nanobeir import get_model_config, create_evaluation_package + +# Get a predefined model configuration +config = get_model_config("e5-small-v2") + +# Create a complete application package +package = create_evaluation_package(config, app_name="myeval") + +# Deploy to Vespa Cloud or local Docker +# ...
(deployment code) +``` + +### Custom Model Configuration + +```python +from vespa.nanobeir import ModelConfig, create_embedder_component, create_embedding_field + +# Define a custom model +config = ModelConfig( + model_id="my-custom-model", + embedding_dim=512, + tokenizer_id="bert-base-uncased", + binarized=False, +) + +# Create individual components +embedder = create_embedder_component(config) +embedding_field = create_embedding_field(config) +``` + +### Binary Embeddings + +```python +from vespa.nanobeir import ModelConfig + +# Configure for binary embeddings +config = ModelConfig( + model_id="bge-m3", + embedding_dim=1024, # Before packing + binarized=True, +) + +# The resulting field will be tensor<int8>(x[128]) with pack_bits in indexing +# The ranking profile will use hamming distance +``` + +## Running the Example + +```bash +# From the repository root +uv run python examples/nanobeir_evaluation_example.py +``` + +This will demonstrate: +1. Creating packages for different float embedding models (e5-small-v2, e5-base-v2) +2. Creating a package for binary embeddings (bge-m3-binary) +3. Creating a package with custom model configuration +4. Listing all available predefined models + +## Available Predefined Models + +- `e5-small-v2`: 384-dimensional float embeddings +- `e5-base-v2`: 768-dimensional float embeddings +- `snowflake-arctic-embed-xs`: 384-dimensional float embeddings +- `snowflake-arctic-embed-s`: 384-dimensional float embeddings +- `snowflake-arctic-embed-m`: 768-dimensional float embeddings +- `bge-m3-binary`: 1024-dimensional binary embeddings (packed to 128 int8 values) + +## Next Steps + +After creating an application package: + +1. **Deploy to Vespa**: Use `VespaCloud` or `VespaDocker` to deploy your application +2. **Feed documents**: Load the NanoBEIR dataset and feed documents to Vespa +3. **Run evaluation**: Use `VespaEvaluator` or `VespaMatchEvaluator` to evaluate retrieval quality +4.
**Compare models**: Run the same evaluation with different model configurations to compare performance + +## Related Documentation + +- [vespa.nanobeir API Reference](../vespa/nanobeir.py) +- [vespa.evaluation API Reference](../vespa/evaluation.py) +- [Vespa Documentation - Embeddings](https://docs.vespa.ai/en/embedding.html) +- [Vespa Documentation - Binary Quantization](https://docs.vespa.ai/en/embedding.html#binary-quantization) diff --git a/examples/nanobeir_evaluation_example.py b/examples/nanobeir_evaluation_example.py new file mode 100644 index 000000000..e7d676f8b --- /dev/null +++ b/examples/nanobeir_evaluation_example.py @@ -0,0 +1,196 @@ +#!/usr/bin/env python3 +""" +Example script demonstrating NanoBEIR evaluation with different models. + +This script shows how to easily switch between different embedding models +for evaluation, handling differences in embedding dimensions, tokenizers, +and binary vs. float embeddings. +""" + +from vespa.nanobeir import ( + ModelConfig, + get_model_config, + create_embedder_component, + create_embedding_field, + create_evaluation_package, +) + + +def main(): + """ + Main function demonstrating evaluation setup with different models. + """ + print("NanoBEIR Evaluation Example") + print("=" * 60) + + # Example 1: Single model by name (e5-small-v2) + print("\n1. 
Single model: e5-small-v2 (float embeddings, 384 dim)") + print("-" * 60) + package_e5_small = create_evaluation_package( + "e5-small-v2", + app_name="nanobeirsmall", + ) + config_e5_small = get_model_config("e5-small-v2") + print(f" Model: {config_e5_small.model_id}") + print(f" Embedding dim: {config_e5_small.embedding_dim}") + print(f" Binarized: {config_e5_small.binarized}") + print(f" Component ID: {config_e5_small.component_id}") + embedding_field = package_e5_small.schema.document.fields[2] + print(f" Schema embedding field name: {embedding_field.name}") + print(f" Schema embedding field type: {embedding_field.type}") + print(f" Number of components: {len(package_e5_small.components)}") + print(f" Number of rank profiles: {len(package_e5_small.schema.rank_profiles)}") + profile_names = [ + p.name if hasattr(p, "name") else str(p) + for p in package_e5_small.schema.rank_profiles + ] + print(f" Rank profile names: {profile_names}") + + # Example 2: Single model with custom config + print("\n2. Single model with custom config (512 dim)") + print("-" * 60) + custom_config = ModelConfig( + model_id="custom-embedding-model", + embedding_dim=512, + tokenizer_id="bert-base-uncased", + binarized=False, + ) + package_custom = create_evaluation_package( + custom_config, + app_name="nanobeircustom", + ) + print(f" Model: {custom_config.model_id}") + print(f" Tokenizer: {custom_config.tokenizer_id}") + print(f" Embedding dim: {custom_config.embedding_dim}") + embedding_field = package_custom.schema.document.fields[2] + print(f" Schema embedding field name: {embedding_field.name}") + print(f" Schema embedding field type: {embedding_field.type}") + + # Example 3: Multiple models (e5-small-v2 and e5-base-v2) + print("\n3. 
Multiple models: e5-small-v2 (384 dim) + e5-base-v2 (768 dim)") + print("-" * 60) + package_multi = create_evaluation_package( + ["e5-small-v2", "e5-base-v2"], + app_name="nanobeirmulti", + ) + print(" Number of models: 2") + print(f" Number of components: {len(package_multi.components)}") + print(f" Component IDs: {[c.id for c in package_multi.components]}") + embedding_fields = [ + f + for f in package_multi.schema.document.fields + if f.name.startswith("embedding") + ] + print(f" Number of embedding fields: {len(embedding_fields)}") + print(f" Embedding field names: {[f.name for f in embedding_fields]}") + print(f" Embedding field types: {[f.type for f in embedding_fields]}") + print(f" Number of rank profiles: {len(package_multi.schema.rank_profiles)}") + profile_names_multi = [ + p.name if hasattr(p, "name") else str(p) + for p in package_multi.schema.rank_profiles + ] + print(f" Rank profile names: {profile_names_multi}") + + # Example 4: Multiple models with mixed configs (name + custom config) + print("\n4. Multiple models: e5-small-v2 + custom model (mixed configs)") + print("-" * 60) + custom_mixed = ModelConfig( + model_id="my-custom-embedder", + embedding_dim=256, + binarized=False, + ) + package_mixed = create_evaluation_package( + ["e5-small-v2", custom_mixed], + app_name="nanobeirmixed", + ) + print(f" Number of components: {len(package_mixed.components)}") + print(f" Component IDs: {[c.id for c in package_mixed.components]}") + embedding_fields_mixed = [ + f + for f in package_mixed.schema.document.fields + if f.name.startswith("embedding") + ] + print(f" Embedding field names: {[f.name for f in embedding_fields_mixed]}") + print(f" Embedding field types: {[f.type for f in embedding_fields_mixed]}") + + # Example 5: ModernBERT with advanced configuration + print("\n5. 
Single model: nomic-ai-modernbert (ModernBERT-based, 768 dim)") + print("-" * 60) + config_modernbert = get_model_config("nomic-ai-modernbert") + package_modernbert = create_evaluation_package( + "nomic-ai-modernbert", + app_name="nanobeirmodern", + ) + print(f" Model: {config_modernbert.model_id}") + print(f" Embedding dim: {config_modernbert.embedding_dim}") + print(f" Max tokens: {config_modernbert.max_tokens}") + print(f" Transformer output: {config_modernbert.transformer_output}") + print(f" Query prepend: {config_modernbert.query_prepend}") + print(f" Document prepend: {config_modernbert.document_prepend}") + embedding_field = package_modernbert.schema.document.fields[2] + print(f" Schema embedding field name: {embedding_field.name}") + print(f" Schema embedding field type: {embedding_field.type}") + print(f" Distance metric: {embedding_field.ann.distance_metric}") + + # Example 6: List all available predefined models + print("\n6. Available predefined models:") + print("-" * 60) + from vespa.nanobeir import COMMON_MODELS + + for model_name, config in COMMON_MODELS.items(): + binary_str = " (binary)" if config.binarized else "" + print(f" - {model_name}: {config.embedding_dim} dim{binary_str}") + + # Example 7: Advanced configuration with URL-based models + print("\n7. 
Advanced configuration: URL-based model with custom parameters") + print("-" * 60) + gte_config = ModelConfig( + model_id="gte-multilingual-base", + embedding_dim=768, + component_id="gte_multilingual", + model_url="https://huggingface.co/onnx-community/gte-multilingual-base/resolve/main/onnx/model_quantized.onnx", + tokenizer_url="https://huggingface.co/onnx-community/gte-multilingual-base/resolve/main/tokenizer.json", + transformer_output="token_embeddings", + max_tokens=8192, + query_prepend="Represent this sentence for searching relevant passages: ", + document_prepend="passage: ", + ) + + embedder = create_embedder_component(gte_config) + embedding_field = create_embedding_field(gte_config) + + print(f" Model: {gte_config.model_id}") + print(f" Embedding dim: {gte_config.embedding_dim}") + print(f" Component ID: {embedder.id}") + print(f" Max tokens: {gte_config.max_tokens}") + print(f" Transformer output: {gte_config.transformer_output}") + print(f" Query prepend: {gte_config.query_prepend[:50]}...") + print(f" Document prepend: {gte_config.document_prepend}") + print(f" Number of parameters: {len(embedder.parameters)}") + print(f" Schema embedding field type: {embedding_field.type}") + + print("\n" + "=" * 60) + print("Example complete!") + print("\nNext steps:") + print("1. Deploy the package to Vespa Cloud or local Docker") + print("2. Load NanoBEIR dataset and feed documents") + print("3. Run evaluation using VespaEvaluator or VespaMatchEvaluator") + print("4. Compare results across different models") + print("\nAdvanced features demonstrated:") + print("- Using predefined model configurations") + print("- Creating custom model configurations") + print("- Single model setup with simple function call") + print("- Multiple model setup with automatic field/component naming") + print("- Mixed model configurations (predefined + custom)") + print("- Binary vs. 
float embeddings") + print("- URL-based model loading") + print("- Additional embedder parameters (transformer-output, max-tokens, prepend)") + print("\nKey benefits of multi-model support:") + print("- Evaluate multiple models in single deployment") + print("- Compare model performance side-by-side") + print("- Automatic conflict resolution (fields/components named uniquely)") + print("- Each model gets its own set of rank profiles") + + +if __name__ == "__main__": + main() diff --git a/examples/run_nanobeir_eval.py b/examples/run_nanobeir_eval.py new file mode 100644 index 000000000..7b596bfb9 --- /dev/null +++ b/examples/run_nanobeir_eval.py @@ -0,0 +1,619 @@ +#!/usr/bin/env python3 +""" +NanoBEIR Evaluation Runner + +This script demonstrates how to run a complete evaluation workflow using the +NanoBEIR dataset with multiple embedding models. It creates a Vespa application, +feeds documents, runs evaluation queries, and saves results to CSV files. +""" + +import os +import pandas as pd +import vespa.querybuilder as qb +from datasets import load_dataset +from vespa.application import Vespa +from vespa.deployment import VespaCloud, VespaDocker +from vespa.evaluation import VespaMatchEvaluator, VespaEvaluator +from vespa.io import VespaResponse +from vespa.nanobeir import create_evaluation_package, get_model_config, ModelConfig +from enum import Enum + +# Configuration +TENANT_NAME = os.getenv("VESPA_TENANT_NAME", "vespa-team") +APPLICATION = "nanobeireval" +SCHEMA_NAME = "doc" +DATASET_ID = "zeta-alpha-ai/NanoMSMARCO" + + +class DeployTarget(Enum): + VESPA_CLOUD = "vespa_cloud" + LOCAL = "local" + + +TARGET = DeployTarget.LOCAL + +# Models to evaluate - you can modify this list +# Can be: +# - Predefined model names (strings): "e5-small-v2", "nomic-ai-modernbert", etc. 
+# - Custom ModelConfig objects for models not in the predefined list +# Example with custom config: +# MODELS = [ +# "e5-small-v2", +# ModelConfig( +# model_id="custom-model", +# embedding_dim=384, +# binarized=False, +# query_prepend="query: ", +# document_prepend="document: ", +# ) +# ] +kalm_model = ModelConfig( + model_id="kalm", + model_url="https://huggingface.co/thomasht86/KaLM-embedding-multilingual-mini-instruct-v2.5-ONNX/resolve/main/onnx/model_int8.onnx", + tokenizer_url="https://huggingface.co/thomasht86/KaLM-embedding-multilingual-mini-instruct-v2.5-ONNX/resolve/main/tokenizer.json", + transformer_output="token_embeddings", + embedding_dim=896, + binarized=False, + query_prepend="Instruct: Given a query, retrieve documents that answer the query \n Query: ", +) +# 'https://data.vespa-cloud.com/onnx_models/e5-small-v2/model.onnx' +# 'https://data.vespa-cloud.com/onnx_models/e5-small-v2/tokenizer.json' +e5_small_v2 = ModelConfig( + model_id="e5_small_v2", + model_url="https://data.vespa-cloud.com/onnx_models/e5-small-v2/model.onnx", + tokenizer_url="https://data.vespa-cloud.com/onnx_models/e5-small-v2/tokenizer.json", + embedding_dim=384, + binarized=False, + max_tokens=512, + query_prepend="query: ", + document_prepend="passage: ", +) + +MODELS = [e5_small_v2, kalm_model] + + +def feed_data(app: Vespa, dataset_id: str, schema_name: str): + """ + Load and feed the NanoBEIR dataset to Vespa. 
+ + Args: + app: Vespa application instance + dataset_id: HuggingFace dataset identifier + schema_name: Name of the Vespa schema + """ + print(f"\nLoading dataset: {dataset_id}") + dataset = load_dataset(dataset_id, "corpus", split="train", streaming=True) + + vespa_feed = dataset.map( + lambda x: { + "id": x["_id"], + "fields": {"text": x["text"], "id": x["_id"]}, + } + ) + + def callback(response: VespaResponse, id: str): + if not response.is_successful(): + print(f"Error when feeding document {id}: {response.get_json()}") + + print("Feeding documents to Vespa...") + app.feed_iterable( + vespa_feed, + schema=schema_name, + namespace="nanobeir", + callback=callback, + ) + print("Feeding complete!") + + +def load_queries_and_qrels(dataset_id: str): + """ + Load queries and relevance judgments from the dataset. + + Args: + dataset_id: HuggingFace dataset identifier + + Returns: + Tuple of (queries dict, relevant_docs dict) + """ + print("\nLoading queries and relevance judgments...") + query_ds = load_dataset(dataset_id, "queries", split="train") + qrels = load_dataset(dataset_id, "qrels", split="train") + + queries = dict(zip(query_ds["_id"], query_ds["text"])) + relevant_docs = dict(zip(qrels["query-id"], qrels["corpus-id"])) + + print(f"Loaded {len(queries)} queries and {len(relevant_docs)} relevance judgments") + return queries, relevant_docs + + +def create_query_functions(model_configs, schema_name: str): + """ + Create query functions for different retrieval strategies. 
+ + Args: + model_configs: List of ModelConfig objects + schema_name: Name of the Vespa schema + + Returns: + Dictionary mapping strategy names to query functions + """ + is_multi_model = len(model_configs) > 1 + query_functions = {} + + for config in model_configs: + # Determine naming based on single vs multi-model setup + if is_multi_model: + embedding_field = f"embedding_{config.component_id}" + query_tensor = f"q_{config.component_id}" + profile_suffix = f"_{config.component_id}" + model_label = f"_{config.component_id}" + else: + embedding_field = "embedding" + query_tensor = "q" + profile_suffix = "" + model_label = "" + + # Match strategies (for VespaMatchEvaluator) + def make_semantic_match_fn(embedding_field, query_tensor, embedder_id): + def semantic_match_query_fn(query_text: str, top_k: int) -> dict: + return { + "yql": str( + qb.select("*") + .from_(schema_name) + .where( + qb.nearestNeighbor( + field=embedding_field, + query_vector=query_tensor, + annotations={"targetHits": 100}, + ) + ) + ), + "query": query_text, + "ranking": "match-only", + f"input.query({query_tensor})": f"embed({embedder_id}, @query)", # @query references the "query" request parameter, so the text needs no quote-escaping + } + + return semantic_match_query_fn + + def make_weakand_match_fn(embedder_id): + def weakand_match_query_fn(query_text: str, top_k: int) -> dict: + return { + "yql": str( + qb.select("*") + .from_(schema_name) + .where(qb.userQuery(query_text)) + ), + "query": query_text, + "ranking": "match-only", + "input.query(q)": f"embed({embedder_id}, @query)", + } + + return weakand_match_query_fn + + def make_hybrid_match_fn(embedding_field, query_tensor, embedder_id): + def hybrid_match_query_fn(query_text: str, top_k: int) -> dict: + return { + "yql": str( + qb.select("*") + .from_(schema_name) + .where( + qb.nearestNeighbor( + field=embedding_field, + query_vector=query_tensor, + annotations={"targetHits": 100}, + ) + | qb.userQuery(query_text) + ) + ), + "query": query_text, + "ranking": "match-only", +
f"input.query({query_tensor})": f"embed({embedder_id}, @query)", + } + + return hybrid_match_query_fn + + # Ranking strategies (for VespaEvaluator) + def make_semantic_fn( + embedding_field, query_tensor, profile_suffix, embedder_id + ): + def semantic_query_fn(query_text: str, top_k: int) -> dict: + return { + "yql": str( + qb.select("*") + .from_(schema_name) + .where( + qb.nearestNeighbor( + field=embedding_field, + query_vector=query_tensor, + annotations={"targetHits": 100}, + ) + ) + ), + "query": query_text, + "ranking": f"semantic{profile_suffix}", + f"input.query({query_tensor})": f"embed({embedder_id}, @query)", + "hits": top_k, + } + + return semantic_query_fn + + def make_bm25_fn(profile_suffix): + def bm25_query_fn(query_text: str, top_k: int) -> dict: + return { + "yql": "select * from sources * where userQuery();", + "query": query_text, + "ranking": f"bm25{profile_suffix}", + "hits": top_k, + } + + return bm25_query_fn + + def make_fusion_fn(embedding_field, query_tensor, profile_suffix, embedder_id): + def fusion_query_fn(query_text: str, top_k: int) -> dict: + return { + "yql": str( + qb.select("*") + .from_(schema_name) + .where( + qb.nearestNeighbor( + field=embedding_field, + query_vector=query_tensor, + annotations={"targetHits": 100}, + ) + | qb.userQuery(query_text) + ) + ), + "query": query_text, + "ranking": f"fusion{profile_suffix}", + f"input.query({query_tensor})": f"embed({embedder_id}, @query)", + "hits": top_k, + } + + return fusion_query_fn + + def make_atan_norm_fn( + embedding_field, query_tensor, profile_suffix, embedder_id + ): + def atan_norm_query_fn(query_text: str, top_k: int) -> dict: + return { + "yql": str( + qb.select("*") + .from_(schema_name) + .where( + qb.nearestNeighbor( + field=embedding_field, + query_vector=query_tensor, + annotations={"targetHits": 100}, + ) + | qb.userQuery(query_text) + ) + ), + "query": query_text, + "ranking": f"atan_norm{profile_suffix}", +
f"input.query({query_tensor})": f"embed({embedder_id}, @query)", + "hits": top_k, + } + + return atan_norm_query_fn + + # Add match strategies + query_functions[f"match_semantic{model_label}"] = make_semantic_match_fn( + embedding_field, query_tensor, config.component_id + ) + query_functions[f"match_hybrid{model_label}"] = make_hybrid_match_fn( + embedding_field, query_tensor, config.component_id + ) + + # Add ranking strategies + query_functions[f"semantic{model_label}"] = make_semantic_fn( + embedding_field, query_tensor, profile_suffix, config.component_id + ) + query_functions[f"bm25{model_label}"] = make_bm25_fn(profile_suffix) + query_functions[f"fusion{model_label}"] = make_fusion_fn( + embedding_field, query_tensor, profile_suffix, config.component_id + ) + query_functions[f"atan_norm{model_label}"] = make_atan_norm_fn( + embedding_field, query_tensor, profile_suffix, config.component_id + ) + + # Add weakand match strategy (only once, not model-specific) + # Use the first model's embedder_id for consistency + first_embedder_id = model_configs[0].component_id + + def weakand_match_query_fn(query_text: str, top_k: int) -> dict: + return { + "yql": str( + qb.select("*").from_(schema_name).where(qb.userQuery(query_text)) + ), + "query": query_text, + "ranking": "match-only", + "input.query(q)": f"embed({first_embedder_id}, @query)", + } + + query_functions["match_weakand"] = weakand_match_query_fn + + return query_functions + + +def run_match_evaluation( + app: Vespa, queries: dict, relevant_docs: dict, query_functions: dict +): + """ + Run match evaluation (VespaMatchEvaluator) for retrieval strategies.
+ + Args: + app: Vespa application instance + queries: Dictionary mapping query IDs to query text + relevant_docs: Dictionary mapping query IDs to relevant document IDs + query_functions: Dictionary mapping strategy names to query functions + + Returns: + DataFrame with match evaluation results + """ + print("\n" + "=" * 80) + print("RUNNING MATCH EVALUATION (Retrieval Phase)") + print("=" * 80) + + match_results = {} + match_strategies = [k for k in query_functions.keys() if k.startswith("match_")] + + for strategy_name in match_strategies: + print(f"\nEvaluating {strategy_name}...") + query_fn = query_functions[strategy_name] + + match_evaluator = VespaMatchEvaluator( + queries=queries, + relevant_docs=relevant_docs, + vespa_query_fn=query_fn, + app=app, + name=strategy_name, + id_field="id", + write_csv=True, + write_verbose=True, + ) + + results = match_evaluator() + match_results[strategy_name] = results + print(f"Results for {strategy_name}:") + print(results) + + return pd.DataFrame(match_results) + + +def run_ranking_evaluation( + app: Vespa, queries: dict, relevant_docs: dict, query_functions: dict +): + """ + Run ranking evaluation (VespaEvaluator) for ranking strategies. 
+ + Args: + app: Vespa application instance + queries: Dictionary mapping query IDs to query text + relevant_docs: Dictionary mapping query IDs to relevant document IDs + query_functions: Dictionary mapping strategy names to query functions + + Returns: + DataFrame with ranking evaluation results + """ + print("\n" + "=" * 80) + print("RUNNING RANKING EVALUATION (Ranking Phase)") + print("=" * 80) + + ranking_results = {} + ranking_strategies = [ + k for k in query_functions.keys() if not k.startswith("match_") + ] + + for strategy_name in ranking_strategies: + print(f"\nEvaluating {strategy_name}...") + query_fn = query_functions[strategy_name] + + evaluator = VespaEvaluator( + queries=queries, + relevant_docs=relevant_docs, + vespa_query_fn=query_fn, + app=app, + name=strategy_name, + write_csv=True, + ) + + results = evaluator.run() + ranking_results[strategy_name] = results + + return pd.DataFrame(ranking_results) + + +def save_results( + match_results_df: pd.DataFrame, + ranking_results_df: pd.DataFrame, + output_dir: str = ".", +): + """ + Save evaluation results and create visualizations. 
+ + Args: + match_results_df: DataFrame with match evaluation results + ranking_results_df: DataFrame with ranking evaluation results + output_dir: Directory to save results (default: current directory) + """ + print("\n" + "=" * 80) + print("SAVING RESULTS") + print("=" * 80) + + # Save match results + match_csv_path = os.path.join(output_dir, "nanobeir_match_results.csv") + match_results_df.to_csv(match_csv_path) + print(f"\nMatch results saved to: {match_csv_path}") + print("\nMatch Results Summary:") + print(match_results_df) + + # Save and process ranking results + ranking_csv_path = os.path.join(output_dir, "nanobeir_ranking_results.csv") + ranking_results_df.to_csv(ranking_csv_path) + print(f"\nRanking results saved to: {ranking_csv_path}") + + # Separate searchtime from other metrics + searchtime = ranking_results_df[ranking_results_df.index.str.contains("searchtime")] + metrics = ranking_results_df[~ranking_results_df.index.str.contains("searchtime")] + + # Save separate CSVs + metrics_csv_path = os.path.join(output_dir, "nanobeir_ranking_metrics.csv") + searchtime_csv_path = os.path.join(output_dir, "nanobeir_searchtime.csv") + metrics.to_csv(metrics_csv_path) + searchtime.to_csv(searchtime_csv_path) + print(f"Ranking metrics saved to: {metrics_csv_path}") + print(f"Search time saved to: {searchtime_csv_path}") + + print("\nRanking Metrics Summary:") + print(metrics) + + print("\nSearch Time Summary (ms):") + print(searchtime * 1000) + + # Try to create visualizations if matplotlib is available + try: + import matplotlib.pyplot as plt + + # Plot ranking metrics + fig, ax = plt.subplots(figsize=(12, 6)) + metrics.plot(kind="bar", ax=ax) + ax.set_title("NanoBEIR Ranking Metrics Comparison") + ax.set_ylabel("Score") + plt.tight_layout() + metrics_plot_path = os.path.join(output_dir, "nanobeir_ranking_metrics.png") + plt.savefig(metrics_plot_path) + print(f"\nRanking metrics plot saved to: {metrics_plot_path}") + plt.close() + + # Plot search time + fig, ax = 
plt.subplots(figsize=(12, 6)) + (searchtime * 1000).plot(kind="bar", ax=ax) + ax.set_title("NanoBEIR Search Time Comparison") + ax.set_ylabel("Time (ms)") + plt.tight_layout() + searchtime_plot_path = os.path.join(output_dir, "nanobeir_searchtime.png") + plt.savefig(searchtime_plot_path) + print(f"Search time plot saved to: {searchtime_plot_path}") + plt.close() + + except ImportError: + print("\nNote: matplotlib not available, skipping plot generation") + + +def main(): + """ + Main function to run the complete NanoBEIR evaluation workflow. + """ + print("=" * 80) + print("NanoBEIR EVALUATION RUNNER") + print("=" * 80) + print("\nConfiguration:") + print(f" Tenant: {TENANT_NAME}") + print(f" Application: {APPLICATION}") + print(f" Schema: {SCHEMA_NAME}") + print(f" Dataset: {DATASET_ID}") + print(f" Models: {MODELS}") + + # Create application package + print("\n" + "=" * 80) + print("CREATING APPLICATION PACKAGE") + print("=" * 80) + package = create_evaluation_package( + MODELS, + app_name=APPLICATION, + schema_name=SCHEMA_NAME, + ) + package.to_files("evaltest") + print("\nCreated package with:") + print(f" - {len(package.components)} embedding component(s)") + print(f" - {len(package.schema.rank_profiles)} rank profile(s)") + embedding_fields = [ + f for f in package.schema.document.fields if f.name.startswith("embedding") + ] + print(f" - {len(embedding_fields)} embedding field(s)") + + if TARGET == DeployTarget.VESPA_CLOUD: + # Deploy to Vespa Cloud + print("\n" + "=" * 80) + print("DEPLOYING TO VESPA CLOUD") + print("=" * 80) + vespa_cloud = VespaCloud( + tenant=TENANT_NAME, + application=APPLICATION, + key_content=os.getenv("VESPA_TEAM_API_KEY", None), + application_package=package, + ) + app: Vespa = vespa_cloud.deploy(max_wait=1800) + elif TARGET == DeployTarget.LOCAL: + # Deploy locally using Docker + print("\n" + "=" * 80) + print("DEPLOYING LOCALLY WITH DOCKER") + print("=" * 80) + vespa_docker = VespaDocker() + app: Vespa = vespa_docker.deploy( + 
application_package=package, + ) + print("Deployment successful!") + + try: + # Feed data + print("\n" + "=" * 80) + print("FEEDING DATA") + print("=" * 80) + feed_data(app, DATASET_ID, SCHEMA_NAME) + + # Load queries and qrels + queries, relevant_docs = load_queries_and_qrels(DATASET_ID) + + # Get model configs for query function creation + model_configs = [ + get_model_config(m) if isinstance(m, str) else m for m in MODELS + ] + + # Create query functions + print("\n" + "=" * 80) + print("CREATING QUERY FUNCTIONS") + print("=" * 80) + query_functions = create_query_functions(model_configs, SCHEMA_NAME) + print(f"Created {len(query_functions)} query functions:") + for name in query_functions.keys(): + print(f" - {name}") + + # Run match evaluation + match_results = run_match_evaluation( + app, queries, relevant_docs, query_functions + ) + + # Run ranking evaluation + ranking_results = run_ranking_evaluation( + app, queries, relevant_docs, query_functions + ) + + # Save results + save_results(match_results, ranking_results) + + print("\n" + "=" * 80) + print("EVALUATION COMPLETE!") + print("=" * 80) + + except Exception as e: + print("\n" + "=" * 80) + print("ERROR DURING EVALUATION") + print("=" * 80) + print(f"Exception: {type(e).__name__}: {e}") + import traceback + + traceback.print_exc() + raise + finally: + # Clean up + print("\n" + "=" * 80) + print("CLEANING UP") + print("=" * 80) + print("Deleting Vespa application...") + # vespa_cloud.delete() + print("Cleanup complete!") + + +if __name__ == "__main__": + main() diff --git a/tests/integration/evaluating-vespa-application-cloud.py b/tests/integration/evaluating-vespa-application-cloud.py deleted file mode 100644 index 7b78409d6..000000000 --- a/tests/integration/evaluating-vespa-application-cloud.py +++ /dev/null @@ -1,432 +0,0 @@ -# %% -import os -import pandas as pd -import vespa.querybuilder as qb -from datasets import load_dataset -from vespa.application import Vespa -from vespa.deployment import 
VespaCloud -from vespa.evaluation import VespaMatchEvaluator, VespaEvaluator -from vespa.io import VespaResponse -from vespa.package import ( - ApplicationPackage, - Field, - Schema, - Document, - HNSW, - RankProfile, - Component, - Parameter, - FieldSet, - GlobalPhaseRanking, - Function, -) - -# %% -tenant_name = "vespa-team" -application = "modernbert" -schema_name = "doc" - -# %% -package = ApplicationPackage( - name=application, - schema=[ - Schema( - name=schema_name, - document=Document( - fields=[ - # Note that we need an id field as attribute to be able to do evaluation - # Vespa internal query document id is used as fallback, but have some limitations, see https://docs.vespa.ai/en/document-v1-api-guide.html#query-result-id - Field(name="id", type="string", indexing=["summary", "attribute"]), - Field( - name="text", - type="string", - indexing=["index", "summary"], - index="enable-bm25", - bolding=True, - ), - Field( - name="embedding", - type="tensor(x[768])", - indexing=[ - "input text", - "embed", # uses default model - "index", - "attribute", - ], - ann=HNSW(distance_metric="angular"), - is_document_field=False, - ), - ] - ), - fieldsets=[FieldSet(name="default", fields=["text"])], - rank_profiles=[ - RankProfile( - name="match-only", - inputs=[("query(q)", "tensor(x[768])")], - first_phase="random", # TODO: Remove when pyvespa supports empty first_phase - ), - RankProfile( - name="bm25", - inputs=[("query(q)", "tensor(x[768])")], - functions=[Function(name="bm25text", expression="bm25(text)")], - first_phase="bm25text", - match_features=["bm25text"], - ), - RankProfile( - name="semantic", - inputs=[("query(q)", "tensor(x[768])")], - functions=[ - Function( - name="cos_sim", expression="closeness(field, embedding)" - ) - ], - first_phase="cos_sim", - match_features=["cos_sim"], - ), - RankProfile( - name="fusion", - inherits="bm25", - functions=[ - Function( - name="cos_sim", expression="closeness(field, embedding)" - ) - ], - inputs=[("query(q)", 
"tensor(x[768])")], - first_phase="cos_sim", - global_phase=GlobalPhaseRanking( - expression="reciprocal_rank_fusion(bm25text, closeness(field, embedding))", - rerank_count=1000, - ), - match_features=["cos_sim", "bm25text"], - ), - RankProfile( - name="atan_norm", - inherits="bm25", - inputs=[("query(q)", "tensor(x[768])")], - functions=[ - Function( - name="scale", - args=["val"], - expression="2*atan(val)/(3.14159)", - ), - Function( - name="normalized_bm25", expression="scale(bm25(text))" - ), - Function( - name="cos_sim", expression="closeness(field, embedding)" - ), - ], - first_phase="normalized_bm25", - global_phase=GlobalPhaseRanking( - expression="normalize_linear(normalized_bm25) + normalize_linear(cos_sim)", - rerank_count=1000, - ), - match_features=["cos_sim", "normalized_bm25"], - ), - ], - ) - ], - components=[ - Component( - id="modernbert", - type="hugging-face-embedder", - parameters=[ - Parameter( - name="transformer-model", - args={ - "url": "https://huggingface.co/onnx-community/gte-multilingual-base/resolve/main/onnx/model_quantized.onnx" - }, - ), - Parameter( - name="tokenizer-model", - args={ - "url": "https://huggingface.co/onnx-community/gte-multilingual-base/resolve/main/tokenizer.json" - }, - ), - Parameter( - name="transformer-output", - args={}, - children="token_embeddings", - ), - Parameter( - name="max-tokens", - args={}, - children="8192", - ), - Parameter( - name="prepend", - args={}, - children=[ - Parameter( - name="query", - args={}, - children="Represent this sentence for searching relevant passages: ", - ), - # Parameter(name="document", args={}, children="passage: "), - ], - ), - ], - ) - ], -) - -# %% -package.to_files("modernbert") - -# %% -vespa_cloud = VespaCloud( - tenant=tenant_name, - application=application, - key_content=os.getenv("VESPA_TEAM_API_KEY", None), - application_package=package, -) - -# %% -app: Vespa = vespa_cloud.deploy() - -# %% - -dataset_id = "zeta-alpha-ai/NanoMSMARCO" - -dataset = 
load_dataset(dataset_id, "corpus", split="train", streaming=True) -vespa_feed = dataset.map( - lambda x: { - "id": x["_id"], - "fields": {"text": x["text"], "id": x["_id"]}, - } -) - -# %% -query_ds = load_dataset(dataset_id, "queries", split="train") -qrels = load_dataset(dataset_id, "qrels", split="train") - -# %% -ids_to_query = dict(zip(query_ds["_id"], query_ds["text"])) - -# %% -for idx, (qid, q) in enumerate(ids_to_query.items()): - print(f"qid: {qid}, query: {q}") - if idx == 5: - break - -# %% -relevant_docs = dict(zip(qrels["query-id"], qrels["corpus-id"])) - -# %% -for idx, (qid, doc_id) in enumerate(relevant_docs.items()): - print(f"qid: {qid}, doc_id: {doc_id}") - if idx == 5: - break - - -# %% -def callback(response: VespaResponse, id: str): - if not response.is_successful(): - print(f"Error when feeding document {id}: {response.get_json()}") - - -app.feed_iterable(vespa_feed, schema="doc", namespace="tutorial", callback=callback) - - -# %% -def match_weakand_query_fn(query_text: str, top_k: int) -> dict: - return { - "yql": str(qb.select("*").from_(schema_name).where(qb.userQuery(query_text))), - "query": query_text, - "ranking": "match-only", - "input.query(q)": f"embed({query_text})", - } - - -def match_hybrid_query_fn(query_text: str, top_k: int) -> dict: - return { - "yql": str( - qb.select("*") - .from_(schema_name) - .where( - qb.nearestNeighbor( - field="embedding", - query_vector="q", - annotations={"targetHits": 100}, - ) - | qb.userQuery( - query_text, - ) - ) - ), - "query": query_text, - "ranking": "match-only", - "input.query(q)": f"embed({query_text})", - } - - -def match_semantic_query_fn(query_text: str, top_k: int) -> dict: - return { - "yql": str( - qb.select("*") - .from_(schema_name) - .where( - qb.nearestNeighbor( - field="embedding", - query_vector="q", - annotations={"targetHits": 100}, - ) - ) - ), - "query": query_text, - "ranking": "match-only", - "input.query(q)": f"embed({query_text})", - } - - -# %% -match_results = {} 
-for evaluator_name, query_fn in [ - ("semantic", match_semantic_query_fn), - ("weakand", match_weakand_query_fn), - ("hybrid", match_hybrid_query_fn), -]: - print(f"Evaluating {evaluator_name}...") - - match_evaluator = VespaMatchEvaluator( - queries=ids_to_query, - relevant_docs=relevant_docs, - vespa_query_fn=query_fn, - app=app, - name="test-run", - id_field="id", # specify the id field used in the relevant_docs - write_csv=True, - write_verbose=True, # optionally write verbose metrics to CSV - ) - - results = match_evaluator() - match_results[evaluator_name] = results - print(f"Results for {evaluator_name}:") - print(results) - -# %% -results = pd.DataFrame(match_results) -results - - -# %% -def semantic_query_fn(query_text: str, top_k: int) -> dict: - return { - "yql": str( - qb.select("*") - .from_(schema_name) - .where( - qb.nearestNeighbor( - field="embedding", - query_vector="q", - annotations={"targetHits": 100}, - ) - ) - ), - "query": query_text, - "ranking": "semantic", - "input.query(q)": f"embed({query_text})", - "hits": top_k, - } - - -def bm25_query_fn(query_text: str, top_k: int) -> dict: - return { - "yql": "select * from sources * where userQuery();", # provide the yql directly as a string - "query": query_text, - "ranking": "bm25", - "hits": top_k, - } - - -def fusion_query_fn(query_text: str, top_k: int) -> dict: - return { - "yql": str( - qb.select("*") - .from_(schema_name) - .where( - qb.nearestNeighbor( - field="embedding", - query_vector="q", - annotations={"targetHits": 100}, - ) - | qb.userQuery(query_text) - ) - ), - "query": query_text, - "ranking": "fusion", - "input.query(q)": f"embed({query_text})", - "hits": top_k, - } - - -def atan_norm_query_fn(query_text: str, top_k: int) -> dict: - return { - "yql": str( - qb.select("*") - .from_(schema_name) - .where( - qb.nearestNeighbor( - field="embedding", - query_vector="q", - annotations={"targetHits": 100}, - ) - | qb.userQuery(query_text) - ) - ), - "query": query_text, - "ranking": 
"atan_norm", - "input.query(q)": f"embed({query_text})", - "hits": top_k, - } - - -# %% -all_results = {} -for evaluator_name, query_fn in [ - ("semantic", semantic_query_fn), - ("bm25", bm25_query_fn), - ("fusion", fusion_query_fn), - ("atan_norm", atan_norm_query_fn), -]: - print(f"Evaluating {evaluator_name}...") - evaluator = VespaEvaluator( - queries=ids_to_query, - relevant_docs=relevant_docs, - vespa_query_fn=query_fn, - app=app, - name=evaluator_name, - write_csv=True, # optionally write metrics to CSV - ) - - results = evaluator.run() - all_results[evaluator_name] = results - -# %% -results = pd.DataFrame(all_results) - -# %% -# take out all rows with "searchtime" to a separate dataframe -searchtime = results[results.index.str.contains("searchtime")] -results = results[~results.index.str.contains("searchtime")] - - -# Highlight the maximum value in each row -def highlight_max(s): - is_max = s == s.max() - return ["background-color: lightgreen; color: black;" if v else "" for v in is_max] - - -# Style the DataFrame: Highlight max values and format numbers to 4 decimals -styled_df = results.style.apply(highlight_max, axis=1).format("{:.4f}") -styled_df - -# %% -results.plot(kind="bar", figsize=(12, 6)) - -# %% -searchtime = searchtime * 1000 -searchtime.plot(kind="bar", figsize=(12, 6)).set(ylabel="time (ms)") - -# %% -vespa_cloud.delete() diff --git a/tests/unit/test_nanobeir.py b/tests/unit/test_nanobeir.py new file mode 100644 index 000000000..4362983d1 --- /dev/null +++ b/tests/unit/test_nanobeir.py @@ -0,0 +1,759 @@ +""" +Tests for NanoBEIR evaluation utilities. 
+""" + +import pytest +from vespa.nanobeir import ( + ModelConfig, + create_embedder_component, + create_embedding_field, + create_semantic_rank_profile, + create_hybrid_rank_profile, + get_model_config, + COMMON_MODELS, +) +from vespa.package import Component, Field, RankProfile +from vespa.configuration.vt import compare_xml + + +class TestModelConfig: + """Test ModelConfig dataclass.""" + + def test_basic_config(self): + """Test basic model configuration.""" + config = ModelConfig( + model_id="test-model", + embedding_dim=384, + ) + assert config.model_id == "test-model" + assert config.embedding_dim == 384 + assert config.tokenizer_id == "test-model" # Defaults to model_id + assert config.binarized is False + assert config.component_id == "test_model" # Hyphens replaced + + def test_config_with_tokenizer(self): + """Test configuration with separate tokenizer.""" + config = ModelConfig( + model_id="e5-small-v2", + embedding_dim=384, + tokenizer_id="e5-base-v2-vocab", + ) + assert config.tokenizer_id == "e5-base-v2-vocab" + + def test_config_binarized(self): + """Test binarized model configuration.""" + config = ModelConfig( + model_id="bge-m3", + embedding_dim=1024, + binarized=True, + ) + assert config.binarized is True + + def test_config_with_paths(self): + """Test configuration with local paths.""" + config = ModelConfig( + model_id="custom-model", + embedding_dim=512, + model_path="/path/to/model.onnx", + tokenizer_path="/path/to/tokenizer.json", + ) + assert config.model_path == "/path/to/model.onnx" + assert config.tokenizer_path == "/path/to/tokenizer.json" + + def test_config_with_urls(self): + """Test configuration with URLs.""" + config = ModelConfig( + model_id="url-model", + embedding_dim=768, + model_url="https://example.com/model.onnx", + tokenizer_url="https://example.com/tokenizer.json", + ) + assert config.model_url == "https://example.com/model.onnx" + assert config.tokenizer_url == "https://example.com/tokenizer.json" + + def 
test_config_with_explicit_parameters(self): + """Test configuration with explicit huggingface embedder parameters.""" + config = ModelConfig( + model_id="custom-model", + embedding_dim=768, + max_tokens=8192, + transformer_output="token_embeddings", + pooling_strategy="cls", + normalize=True, + query_prepend="query: ", + document_prepend="passage: ", + ) + assert config.max_tokens == 8192 + assert config.transformer_output == "token_embeddings" + assert config.pooling_strategy == "cls" + assert config.normalize is True + assert config.query_prepend == "query: " + assert config.document_prepend == "passage: " + + def test_config_pooling_strategy_validation(self): + """Test that invalid pooling strategy raises error.""" + with pytest.raises(ValueError, match="pooling_strategy must be one of"): + ModelConfig( + model_id="test", + embedding_dim=384, + pooling_strategy="invalid", + ) + + def test_config_invalid_dimension(self): + """Test that invalid embedding dimension raises error.""" + with pytest.raises(ValueError, match="embedding_dim must be positive"): + ModelConfig(model_id="test", embedding_dim=0) + + with pytest.raises(ValueError, match="embedding_dim must be positive"): + ModelConfig(model_id="test", embedding_dim=-1) + + def test_component_id_sanitization(self): + """Test that component IDs are properly sanitized.""" + config = ModelConfig( + model_id="some/model-with-special_chars", + embedding_dim=384, + ) + # Slashes and hyphens should be replaced with underscores + assert config.component_id == "some_model_with_special_chars" + + +class TestCreateEmbedderComponent: + """Test create_embedder_component function.""" + + def test_component_with_model_id(self): + """Test component creation with model ID.""" + config = ModelConfig( + model_id="e5-small-v2", + embedding_dim=384, + tokenizer_id="e5-base-v2-vocab", + ) + component = create_embedder_component(config) + + assert isinstance(component, Component) + assert component.id == "e5_small_v2" + assert 
component.type == "hugging-face-embedder" + assert len(component.parameters) == 1 + + # Check transformer-model parameter + assert component.parameters[0].name == "transformer-model" + assert component.parameters[0].args == {"model-id": "e5-small-v2"} + + def test_component_with_paths(self): + """Test component creation with file paths.""" + config = ModelConfig( + model_id="custom-model", + embedding_dim=384, + model_path="/models/custom.onnx", + tokenizer_path="/models/tokenizer.json", + ) + component = create_embedder_component(config) + + assert component.parameters[0].args == {"path": "/models/custom.onnx"} + assert component.parameters[1].args == {"path": "/models/tokenizer.json"} + + def test_component_with_urls(self): + """Test component creation with URLs.""" + config = ModelConfig( + model_id="url-model", + embedding_dim=768, + model_url="https://huggingface.co/model.onnx", + tokenizer_url="https://huggingface.co/tokenizer.json", + ) + component = create_embedder_component(config) + + assert component.parameters[0].args == { + "url": "https://huggingface.co/model.onnx" + } + assert component.parameters[1].args == { + "url": "https://huggingface.co/tokenizer.json" + } + + def test_component_with_explicit_parameters(self): + """Test component creation with explicit huggingface embedder parameters.""" + config = ModelConfig( + model_id="advanced-model", + embedding_dim=768, + max_tokens=8192, + transformer_output="token_embeddings", + pooling_strategy="cls", + normalize=True, + ) + component = create_embedder_component(config) + + # Should have transformer-model, plus 4 explicit parameters + assert len(component.parameters) == 5 + assert component.parameters[1].name == "max-tokens" + assert component.parameters[1].children == "8192" + assert component.parameters[2].name == "transformer-output" + assert component.parameters[2].children == "token_embeddings" + assert component.parameters[3].name == "pooling-strategy" + assert component.parameters[3].children 
== "cls" + assert component.parameters[4].name == "normalize" + assert component.parameters[4].children == "true" + + def test_component_with_prepend_parameters(self): + """Test component creation with prepend parameters.""" + config = ModelConfig( + model_id="prepend-model", + embedding_dim=768, + model_url="https://example.com/model.onnx", + tokenizer_url="https://example.com/tokenizer.json", + query_prepend="Represent this sentence for searching relevant passages: ", + document_prepend="passage: ", + ) + component = create_embedder_component(config) + + # Should have transformer-model, tokenizer-url plus prepend parameter + assert len(component.parameters) == 3 + prepend_param = component.parameters[2] + assert prepend_param.name == "prepend" + assert isinstance(prepend_param.children, list) + assert len(prepend_param.children) == 2 + assert prepend_param.children[0].name == "query" + assert ( + prepend_param.children[0].children + == "Represent this sentence for searching relevant passages: " + ) + assert prepend_param.children[1].name == "document" + assert prepend_param.children[1].children == "passage: " + + def test_component_with_only_query_prepend(self): + """Test component creation with only query prepend.""" + config = ModelConfig( + model_id="query-prepend-model", + embedding_dim=768, + query_prepend="query: ", + ) + component = create_embedder_component(config) + + # Should have transformer-model plus prepend parameter + assert len(component.parameters) == 2 + prepend_param = component.parameters[1] + assert prepend_param.name == "prepend" + assert len(prepend_param.children) == 1 + assert prepend_param.children[0].name == "query" + + def test_component_url_priority_over_path(self): + """Test that URL takes priority over path when both are provided.""" + config = ModelConfig( + model_id="test-model", + embedding_dim=384, + model_path="/path/to/model.onnx", + model_url="https://example.com/model.onnx", + tokenizer_path="/path/to/tokenizer.json", + 
tokenizer_url="https://example.com/tokenizer.json", + ) + component = create_embedder_component(config) + + # URLs should take priority + assert component.parameters[0].args == {"url": "https://example.com/model.onnx"} + assert component.parameters[1].args == { + "url": "https://example.com/tokenizer.json" + } + + +class TestCreateEmbeddingField: + """Test create_embedding_field function.""" + + def test_float_embedding_field(self): + """Test field creation for float embeddings.""" + config = ModelConfig( + model_id="e5-small-v2", + embedding_dim=384, + binarized=False, + ) + field = create_embedding_field(config) + + assert isinstance(field, Field) + assert field.name == "embedding" + assert field.type == "tensor(x[384])" + assert field.is_document_field is False + + # Check indexing statement includes embedder ID + assert "input text" in field.indexing + assert "embed e5_small_v2" in field.indexing + assert "index" in field.indexing + assert "attribute" in field.indexing + assert "pack_bits" not in field.indexing + + # Check HNSW configuration + assert field.ann is not None + assert field.ann.distance_metric == "angular" + + def test_binarized_embedding_field(self): + """Test field creation for binarized embeddings.""" + config = ModelConfig( + model_id="bge-m3", + embedding_dim=1024, + binarized=True, + ) + field = create_embedding_field(config) + + assert field.name == "embedding" + # 1024 bits packed into 128 int8 values + assert field.type == "tensor(x[128])" + + # Check indexing statement includes pack_bits and embedder ID + assert "embed bge_m3" in field.indexing + assert "pack_bits" in field.indexing + + # Check HNSW configuration uses hamming distance + assert field.ann.distance_metric == "hamming" + + def test_custom_field_name(self): + """Test field creation with custom name.""" + config = ModelConfig(model_id="test", embedding_dim=384) + field = create_embedding_field(config, field_name="my_embedding") + + assert field.name == "my_embedding" + + def 
test_custom_distance_metric(self): + """Test field creation with custom distance metric.""" + config = ModelConfig(model_id="test", embedding_dim=384) + field = create_embedding_field(config, distance_metric="euclidean") + + assert field.ann.distance_metric == "euclidean" + + def test_custom_indexing(self): + """Test field creation with custom indexing.""" + config = ModelConfig(model_id="test", embedding_dim=384) + custom_indexing = ["attribute", "index"] + field = create_embedding_field(config, indexing=custom_indexing) + + assert field.indexing == custom_indexing + + def test_custom_embedder_id(self): + """Test field creation with custom embedder ID.""" + config = ModelConfig(model_id="test", embedding_dim=384) + field = create_embedding_field(config, embedder_id="my_embedder") + + assert "embed my_embedder" in field.indexing + + +class TestCreateSemanticRankProfile: + """Test create_semantic_rank_profile function.""" + + def test_float_semantic_profile(self): + """Test semantic profile for float embeddings.""" + config = ModelConfig( + model_id="e5-small-v2", + embedding_dim=384, + binarized=False, + ) + profile = create_semantic_rank_profile(config) + + assert isinstance(profile, RankProfile) + assert profile.name == "semantic" + assert len(profile.inputs) == 1 + assert profile.inputs[0][0] == "query(q)" + assert profile.inputs[0][1] == "tensor(x[384])" + + # Check functions + assert len(profile.functions) == 1 + assert profile.functions[0].name == "similarity" + assert "closeness(field, embedding)" in profile.functions[0].expression + + assert profile.first_phase == "similarity" + assert "similarity" in profile.match_features + + def test_binarized_semantic_profile(self): + """Test semantic profile for binarized embeddings.""" + config = ModelConfig( + model_id="bge-m3", + embedding_dim=1024, + binarized=True, + ) + profile = create_semantic_rank_profile(config) + + # Query tensor should be int8 with packed dimensions + assert profile.inputs[0][1] == 
"tensor(x[128])" + + # Similarity function should handle hamming distance + similarity_func = profile.functions[0] + assert "closeness(field, embedding)" in similarity_func.expression + # Should have transformation for hamming distance + assert "1/(1 + " in similarity_func.expression + + def test_custom_profile_name(self): + """Test semantic profile with custom name.""" + config = ModelConfig(model_id="test", embedding_dim=384) + profile = create_semantic_rank_profile(config, profile_name="my_semantic") + + assert profile.name == "my_semantic" + + def test_custom_embedding_field(self): + """Test semantic profile with custom embedding field name.""" + config = ModelConfig(model_id="test", embedding_dim=384) + profile = create_semantic_rank_profile( + config, + embedding_field="my_embedding", + ) + + # Check that custom field name is used in expression + assert "my_embedding" in profile.functions[0].expression + + def test_custom_query_tensor(self): + """Test semantic profile with custom query tensor name.""" + config = ModelConfig(model_id="test", embedding_dim=384) + profile = create_semantic_rank_profile( + config, + query_tensor="query_embedding", + ) + + assert profile.inputs[0][0] == "query(query_embedding)" + + +class TestCreateHybridRankProfile: + """Test create_hybrid_rank_profile function.""" + + def test_hybrid_profile_rrf(self): + """Test hybrid profile with reciprocal rank fusion.""" + config = ModelConfig( + model_id="e5-small-v2", + embedding_dim=384, + binarized=False, + ) + profile = create_hybrid_rank_profile(config) + + assert isinstance(profile, RankProfile) + assert profile.name == "fusion" + assert profile.inherits == "bm25" + + # Check global phase + assert profile.global_phase is not None + assert "reciprocal_rank_fusion" in profile.global_phase.expression + assert "bm25text" in profile.global_phase.expression + assert profile.global_phase.rerank_count == 1000 + + # Check match features includes both + assert "similarity" in 
profile.match_features + assert "bm25text" in profile.match_features + + def test_hybrid_profile_normalize(self): + """Test hybrid profile with linear normalization.""" + config = ModelConfig(model_id="test", embedding_dim=384) + profile = create_hybrid_rank_profile( + config, + fusion_method="normalize", + ) + + assert "normalize_linear" in profile.global_phase.expression + assert "bm25text" in profile.global_phase.expression + + def test_hybrid_profile_binarized(self): + """Test hybrid profile for binarized embeddings.""" + config = ModelConfig( + model_id="bge-m3", + embedding_dim=1024, + binarized=True, + ) + profile = create_hybrid_rank_profile(config) + + # Query tensor should be int8 + assert profile.inputs[0][1] == "tensor(x[128])" + + # Similarity function should handle hamming distance + similarity_func = profile.functions[0] + assert "1/(1 + " in similarity_func.expression + + def test_hybrid_profile_invalid_fusion(self): + """Test that invalid fusion method raises error.""" + config = ModelConfig(model_id="test", embedding_dim=384) + + with pytest.raises(ValueError, match="Unknown fusion_method"): + create_hybrid_rank_profile(config, fusion_method="invalid") + + def test_custom_profile_name(self): + """Test hybrid profile with custom name.""" + config = ModelConfig(model_id="test", embedding_dim=384) + profile = create_hybrid_rank_profile( + config, + profile_name="my_hybrid", + ) + + assert profile.name == "my_hybrid" + + def test_custom_base_profile(self): + """Test hybrid profile with custom base profile.""" + config = ModelConfig(model_id="test", embedding_dim=384) + profile = create_hybrid_rank_profile( + config, + base_profile="custom_bm25", + ) + + assert profile.inherits == "custom_bm25" + + +class TestPredefinedModels: + """Test predefined model configurations.""" + + def test_common_models_exist(self): + """Test that Vespa Cloud models are defined.""" + assert "nomic-ai-modernbert" in COMMON_MODELS + assert "lightonai-modernbert-large" in 
COMMON_MODELS + assert "alibaba-gte-modernbert" in COMMON_MODELS + assert "e5-small-v2" in COMMON_MODELS + assert "e5-base-v2" in COMMON_MODELS + assert "e5-large-v2" in COMMON_MODELS + assert "multilingual-e5-base" in COMMON_MODELS + + def test_e5_small_v2_config(self): + """Test e5-small-v2 configuration.""" + config = COMMON_MODELS["e5-small-v2"] + assert config.model_id == "e5-small-v2" + assert config.embedding_dim == 384 + assert config.binarized is False + assert config.max_tokens == 512 + assert config.query_prepend == "query: " + assert config.document_prepend == "passage: " + + def test_nomic_ai_modernbert_config(self): + """Test nomic-ai-modernbert configuration.""" + config = COMMON_MODELS["nomic-ai-modernbert"] + assert config.model_id == "nomic-ai-modernbert" + assert config.embedding_dim == 768 + assert config.binarized is False + assert config.max_tokens == 8192 + assert config.transformer_output == "token_embeddings" + assert config.query_prepend == "search_query: " + assert config.document_prepend == "search_document: " + + def test_get_model_config_success(self): + """Test getting a predefined model config.""" + config = get_model_config("e5-small-v2") + assert config.model_id == "e5-small-v2" + assert config.embedding_dim == 384 + + def test_get_model_config_not_found(self): + """Test that unknown model raises error.""" + with pytest.raises(KeyError, match="Unknown model"): + get_model_config("nonexistent-model") + + # Error message should list available models + try: + get_model_config("nonexistent-model") + except KeyError as e: + assert "Available models" in str(e) + + +class TestIntegration: + """Integration tests combining multiple components.""" + + def test_complete_float_setup(self): + """Test complete setup for float embeddings.""" + config = ModelConfig( + model_id="e5-small-v2", + embedding_dim=384, + tokenizer_id="e5-base-v2-vocab", + ) + + component = create_embedder_component(config) + field = create_embedding_field(config) + 
semantic_profile = create_semantic_rank_profile(config) + hybrid_profile = create_hybrid_rank_profile(config) + + # Verify all components work together + assert component.id == config.component_id + assert field.type == "tensor(x[384])" + assert field.ann.distance_metric == "angular" + assert semantic_profile.inputs[0][1] == "tensor(x[384])" + assert hybrid_profile.inputs[0][1] == "tensor(x[384])" + assert "closeness(field, embedding)" in semantic_profile.functions[0].expression + + def test_complete_binarized_setup(self): + """Test complete setup for binarized embeddings.""" + config = ModelConfig( + model_id="bge-m3", + embedding_dim=1024, + binarized=True, + ) + + component = create_embedder_component(config) + field = create_embedding_field(config) + semantic_profile = create_semantic_rank_profile(config) + hybrid_profile = create_hybrid_rank_profile(config) + + # Verify all components work together + assert component.id == config.component_id + assert field.type == "tensor(x[128])" + assert field.ann.distance_metric == "hamming" + assert "pack_bits" in field.indexing + assert semantic_profile.inputs[0][1] == "tensor(x[128])" + assert hybrid_profile.inputs[0][1] == "tensor(x[128])" + # Hamming distance should be handled specially + assert "1/(1 + " in semantic_profile.functions[0].expression + + def test_complete_advanced_setup(self): + """Test complete setup with URL-based model and explicit parameters.""" + config = ModelConfig( + model_id="gte-multilingual", + embedding_dim=768, + model_url="https://huggingface.co/onnx-community/gte-multilingual-base/resolve/main/onnx/model_quantized.onnx", + tokenizer_url="https://huggingface.co/onnx-community/gte-multilingual-base/resolve/main/tokenizer.json", + transformer_output="token_embeddings", + max_tokens=8192, + query_prepend="Represent this sentence for searching relevant passages: ", + document_prepend="passage: ", + ) + + component = create_embedder_component(config) + field = create_embedding_field(config) + 
semantic_profile = create_semantic_rank_profile(config) + hybrid_profile = create_hybrid_rank_profile(config) + + # Verify component configuration + assert component.id == "gte_multilingual" + assert ( + len(component.parameters) == 5 + ) # transformer, tokenizer, max-tokens, transformer-output, prepend + assert component.parameters[0].args["url"] == config.model_url + assert component.parameters[1].args["url"] == config.tokenizer_url + assert component.parameters[2].name == "max-tokens" + assert component.parameters[2].children == "8192" + assert component.parameters[3].name == "transformer-output" + assert component.parameters[3].children == "token_embeddings" + assert component.parameters[4].name == "prepend" + assert len(component.parameters[4].children) == 2 + + # Verify field configuration + assert field.type == "tensor(x[768])" + assert field.ann.distance_metric == "angular" + + # Verify profiles + assert semantic_profile.inputs[0][1] == "tensor(x[768])" + assert hybrid_profile.inputs[0][1] == "tensor(x[768])" + + +class TestCommonModelsXMLGeneration: + """Test that COMMON_MODELS generate XML matching Vespa Cloud documentation.""" + + def test_nomic_ai_modernbert_xml(self): + """Test nomic-ai-modernbert generates correct XML.""" + config = get_model_config("nomic-ai-modernbert") + component = create_embedder_component(config) + xml = component.to_xml_string(indent=1) + + expected = """ + + token_embeddings + 8192 + + search_query: + search_document: + +""" + + assert compare_xml( + xml, expected + ), f"XML mismatch:\nGot:\n{xml}\n\nExpected:\n{expected}" + + def test_lightonai_modernbert_large_xml(self): + """Test lightonai-modernbert-large generates correct XML.""" + config = get_model_config("lightonai-modernbert-large") + component = create_embedder_component(config) + xml = component.to_xml_string(indent=1) + + expected = """ + + 8192 + + search_query: + search_document: + +""" + + assert compare_xml( + xml, expected + ), f"XML 
mismatch:\nGot:\n{xml}\n\nExpected:\n{expected}" + + def test_alibaba_gte_modernbert_xml(self): + """Test alibaba-gte-modernbert generates correct XML.""" + config = get_model_config("alibaba-gte-modernbert") + component = create_embedder_component(config) + xml = component.to_xml_string(indent=1) + + expected = """ + + 8192 + cls +""" + + assert compare_xml( + xml, expected + ), f"XML mismatch:\nGot:\n{xml}\n\nExpected:\n{expected}" + + def test_e5_small_v2_xml(self): + """Test e5-small-v2 generates correct XML.""" + config = get_model_config("e5-small-v2") + component = create_embedder_component(config) + xml = component.to_xml_string(indent=1) + + expected = """ + + 512 + + query: + passage: + +""" + + assert compare_xml( + xml, expected + ), f"XML mismatch:\nGot:\n{xml}\n\nExpected:\n{expected}" + + def test_e5_base_v2_xml(self): + """Test e5-base-v2 generates correct XML.""" + config = get_model_config("e5-base-v2") + component = create_embedder_component(config) + xml = component.to_xml_string(indent=1) + + expected = """ + + 512 + + query: + passage: + +""" + + assert compare_xml( + xml, expected + ), f"XML mismatch:\nGot:\n{xml}\n\nExpected:\n{expected}" + + def test_e5_large_v2_xml(self): + """Test e5-large-v2 generates correct XML.""" + config = get_model_config("e5-large-v2") + component = create_embedder_component(config) + xml = component.to_xml_string(indent=1) + + expected = """ + + 512 + + query: + passage: + +""" + + assert compare_xml( + xml, expected + ), f"XML mismatch:\nGot:\n{xml}\n\nExpected:\n{expected}" + + def test_multilingual_e5_base_xml(self): + """Test multilingual-e5-base generates correct XML.""" + config = get_model_config("multilingual-e5-base") + component = create_embedder_component(config) + xml = component.to_xml_string(indent=1) + + expected = """ + + 512 + + query: + passage: + +""" + + assert compare_xml( + xml, expected + ), f"XML mismatch:\nGot:\n{xml}\n\nExpected:\n{expected}" diff --git a/vespa/nanobeir.py 
"""
NanoBEIR evaluation utilities for Vespa.

This module provides utilities to easily configure and run NanoBEIR evaluations
for different embedding models, handling differences in model dimensions,
tokenizers, and binary vs. float embeddings.
"""

from dataclasses import dataclass, replace
from typing import Dict, List, Optional, Union

from vespa.package import (
    ApplicationPackage,
    Component,
    Parameter,
    Field,
    HNSW,
    RankProfile,
    Function,
    Schema,
    Document,
    FieldSet,
    GlobalPhaseRanking,
)


@dataclass
class ModelConfig:
    """
    Configuration for an embedding model.

    This class encapsulates all model-specific parameters that affect
    the Vespa schema, component configuration, and ranking expressions.

    Every optional embedder setting defaults to ``None``. A ``None`` value means
    the corresponding element is omitted from the generated component, so the
    embedder's own default (listed below as "embedder default") applies.

    Attributes:
        model_id: The model identifier (e.g., 'e5-small-v2', 'snowflake-arctic-embed-xs')
        embedding_dim: The dimension of the embedding vectors before any bit
            packing (e.g., 384, 768). Must be positive.
        tokenizer_id: The tokenizer model identifier (defaults to model_id).
            Note: create_embedder_component only emits a tokenizer-model element
            for tokenizer_url / tokenizer_path; tokenizer_id is informational.
        binarized: Whether the embeddings are binarized (packed bits)
        component_id: The ID to use for the Vespa component (defaults to sanitized model_id)
        model_path: Optional local path to the model file
        tokenizer_path: Optional local path to the tokenizer file
        model_url: Optional URL to the ONNX model file (alternative to model_id)
        tokenizer_url: Optional URL to the tokenizer file (alternative to tokenizer_id)
        max_tokens: Maximum number of tokens accepted by the transformer model (embedder default: 512)
        transformer_input_ids: Name/identifier for transformer input IDs (embedder default: "input_ids")
        transformer_attention_mask: Name/identifier for transformer attention mask (embedder default: "attention_mask")
        transformer_token_type_ids: Name/identifier for transformer token type IDs (embedder default: "token_type_ids").
            Set to an empty string ("") to emit an empty element, which disables token_type_ids.
        transformer_output: Name/identifier for transformer output (embedder default: "last_hidden_state")
        pooling_strategy: How to pool output vectors ("mean", "cls", or "none") (embedder default: "mean")
        normalize: Whether to normalize output to unit length (embedder default: False)
        query_prepend: Optional instruction to prepend to query text
        document_prepend: Optional instruction to prepend to document text

    Raises:
        ValueError: If embedding_dim is not positive or pooling_strategy is not
            one of "mean", "cls", "none".
    """

    model_id: str
    embedding_dim: int
    tokenizer_id: Optional[str] = None
    binarized: bool = False
    component_id: Optional[str] = None
    model_path: Optional[str] = None
    tokenizer_path: Optional[str] = None
    model_url: Optional[str] = None
    tokenizer_url: Optional[str] = None
    max_tokens: Optional[int] = None
    transformer_input_ids: Optional[str] = None
    transformer_attention_mask: Optional[str] = None
    transformer_token_type_ids: Optional[str] = None
    transformer_output: Optional[str] = None
    pooling_strategy: Optional[str] = None
    normalize: Optional[bool] = None
    query_prepend: Optional[str] = None
    document_prepend: Optional[str] = None

    def __post_init__(self):
        """Set defaults and validate configuration."""
        if self.tokenizer_id is None:
            # Use the same ID for tokenizer if not specified
            self.tokenizer_id = self.model_id

        if self.component_id is None:
            # Create a component ID from model_id by replacing hyphens, slashes, and dots with underscores
            # This ensures the ID is a valid Vespa field name: [a-zA-Z]\w*
            self.component_id = (
                self.model_id.replace("-", "_").replace("/", "_").replace(".", "_")
            )

        # Validate embedding dimension
        if self.embedding_dim <= 0:
            raise ValueError(
                f"embedding_dim must be positive, got {self.embedding_dim}"
            )

        # Validate pooling strategy
        if self.pooling_strategy is not None:
            valid_strategies = ["mean", "cls", "none"]
            if self.pooling_strategy not in valid_strategies:
                raise ValueError(
                    f"pooling_strategy must be one of {valid_strategies}, got {self.pooling_strategy}"
                )


def create_embedder_component(config: ModelConfig) -> Component:
    """
    Create a Vespa hugging-face-embedder component from a model configuration.

    Elements are emitted in a fixed order: transformer-model, tokenizer-model,
    max-tokens, transformer-input-ids, transformer-attention-mask,
    transformer-token-type-ids, transformer-output, pooling-strategy, normalize,
    prepend. Settings left as ``None`` on the config are omitted entirely.

    Args:
        config: ModelConfig instance with model parameters

    Returns:
        Component: A Vespa Component configured as a hugging-face-embedder

    Example:
        >>> config = ModelConfig(model_id="e5-small-v2", embedding_dim=384)
        >>> component = create_embedder_component(config)
        >>> component.id
        'e5_small_v2'

        >>> # Example with URL-based model and custom parameters
        >>> config = ModelConfig(
        ...     model_id="gte-multilingual",
        ...     embedding_dim=768,
        ...     model_url="https://huggingface.co/onnx-community/gte-multilingual-base/resolve/main/onnx/model_quantized.onnx",
        ...     tokenizer_url="https://huggingface.co/onnx-community/gte-multilingual-base/resolve/main/tokenizer.json",
        ...     transformer_output="token_embeddings",
        ...     max_tokens=8192,
        ...     query_prepend="Represent this sentence for searching relevant passages: ",
        ...     document_prepend="passage: ",
        ... )
        >>> component = create_embedder_component(config)
        >>> component.id
        'gte_multilingual'
    """

    def text_param(name, value):
        # Element without attributes whose content is ``value`` (None -> empty element).
        return Parameter(name, args={}, children=value)

    parameters = []

    # Transformer model source: URL takes precedence over local path, which
    # takes precedence over the model-id.
    if config.model_url:
        transformer_config = {"url": config.model_url}
    elif config.model_path:
        transformer_config = {"path": config.model_path}
    else:
        transformer_config = {"model-id": config.model_id}
    parameters.append(Parameter("transformer-model", transformer_config))

    # Tokenizer model is only emitted for an explicit URL or path; otherwise
    # no tokenizer-model element is written at all.
    if config.tokenizer_url:
        parameters.append(Parameter("tokenizer-model", {"url": config.tokenizer_url}))
    elif config.tokenizer_path:
        parameters.append(
            Parameter("tokenizer-model", {"path": config.tokenizer_path})
        )

    # Optional huggingface embedder parameters
    if config.max_tokens is not None:
        parameters.append(text_param("max-tokens", str(config.max_tokens)))

    if config.transformer_input_ids is not None:
        parameters.append(
            text_param("transformer-input-ids", config.transformer_input_ids)
        )

    if config.transformer_attention_mask is not None:
        parameters.append(
            text_param("transformer-attention-mask", config.transformer_attention_mask)
        )

    if config.transformer_token_type_ids is not None:
        # An empty string produces an empty element, which disables token_type_ids.
        parameters.append(
            text_param(
                "transformer-token-type-ids",
                config.transformer_token_type_ids or None,
            )
        )

    if config.transformer_output is not None:
        parameters.append(text_param("transformer-output", config.transformer_output))

    if config.pooling_strategy is not None:
        parameters.append(text_param("pooling-strategy", config.pooling_strategy))

    if config.normalize is not None:
        # XML booleans are lowercase ("true"/"false").
        parameters.append(text_param("normalize", str(config.normalize).lower()))

    # Add prepend instructions if specified
    if config.query_prepend is not None or config.document_prepend is not None:
        prepend_children = []
        if config.query_prepend is not None:
            prepend_children.append(text_param("query", config.query_prepend))
        if config.document_prepend is not None:
            prepend_children.append(text_param("document", config.document_prepend))
        parameters.append(Parameter("prepend", args={}, children=prepend_children))

    return Component(
        id=config.component_id,
        type="hugging-face-embedder",
        parameters=parameters,
    )


def _embedding_tensor_type(config: ModelConfig) -> str:
    """Return the Vespa tensor type used for both the embedding field and the query tensor."""
    if config.binarized:
        # For binarized embeddings, we pack 8 bits into each int8 cell.
        return f"tensor<int8>(x[{config.embedding_dim // 8}])"
    return f"tensor<float>(x[{config.embedding_dim}])"


def create_embedding_field(
    config: ModelConfig,
    field_name: str = "embedding",
    indexing: Optional[List[str]] = None,
    distance_metric: Optional[str] = None,
    embedder_id: Optional[str] = None,
) -> Field:
    """
    Create a Vespa embedding field from a model configuration.

    The field type and indexing statement are automatically configured based on
    whether the embeddings are binarized.

    Args:
        config: ModelConfig instance with model parameters
        field_name: Name of the embedding field (default: "embedding")
        indexing: Custom indexing statement (default: auto-generated based on config)
        distance_metric: Distance metric for HNSW (default: "hamming" for binarized, "angular" for float)
        embedder_id: Embedder ID to use in the indexing statement (default: uses config.component_id)

    Returns:
        Field: A Vespa Field configured for embeddings

    Example:
        >>> config = ModelConfig(model_id="e5-small-v2", embedding_dim=384, binarized=False)
        >>> field = create_embedding_field(config)
        >>> field.type
        'tensor<float>(x[384])'

        >>> config_binary = ModelConfig(model_id="bge-m3", embedding_dim=1024, binarized=True)
        >>> field_binary = create_embedding_field(config_binary)
        >>> field_binary.type
        'tensor<int8>(x[128])'
    """
    # Determine embedder ID to use
    embedder_id = embedder_id or config.component_id

    if indexing is None:
        indexing = ["input text", f"embed {embedder_id}"]
        if config.binarized:
            # Binarized embeddings are packed into int8 cells before indexing.
            indexing.append("pack_bits")
        indexing += ["index", "attribute"]

    # Use provided distance metric or the default for the embedding kind
    default_distance_metric = "hamming" if config.binarized else "angular"
    distance_metric = distance_metric or default_distance_metric

    return Field(
        name=field_name,
        type=_embedding_tensor_type(config),
        indexing=indexing,
        ann=HNSW(distance_metric=distance_metric),
        is_document_field=False,
    )


def create_semantic_rank_profile(
    config: ModelConfig,
    profile_name: str = "semantic",
    embedding_field: str = "embedding",
    query_tensor: str = "q",
) -> RankProfile:
    """
    Create a semantic ranking profile based on model configuration.

    The profile scores documents with ``closeness(field, <embedding_field>)``.
    closeness() is derived from the distance metric configured on the field
    (hamming for binarized, angular for float by default) as
    ``1 / (1 + distance)``, so a higher score always means a closer match.
    Only the query tensor type differs between binarized and float models.

    Args:
        config: ModelConfig instance with model parameters
        profile_name: Name of the rank profile (default: "semantic")
        embedding_field: Name of the embedding field (default: "embedding")
        query_tensor: Name of the query tensor (default: "q")

    Returns:
        RankProfile: A Vespa RankProfile configured for semantic search

    Example:
        >>> config = ModelConfig(model_id="e5-small-v2", embedding_dim=384, binarized=False)
        >>> profile = create_semantic_rank_profile(config)
        >>> profile.name
        'semantic'
    """
    # closeness() already maps distance to "higher is better"; wrapping it in
    # another 1/(1 + x) would invert the ordering.
    similarity_expr = f"closeness(field, {embedding_field})"

    return RankProfile(
        name=profile_name,
        inputs=[(f"query({query_tensor})", _embedding_tensor_type(config))],
        functions=[Function(name="similarity", expression=similarity_expr)],
        first_phase="similarity",
        match_features=["similarity"],
    )


def create_hybrid_rank_profile(
    config: ModelConfig,
    profile_name: str = "fusion",
    base_profile: str = "bm25",
    embedding_field: str = "embedding",
    query_tensor: str = "q",
    fusion_method: str = "rrf",
) -> RankProfile:
    """
    Create a hybrid ranking profile combining BM25 and semantic search.

    The profile inherits from ``base_profile``, which is expected to define a
    ``bm25text`` function. First phase ranks by closeness to the query tensor;
    the global phase fuses that with ``bm25text`` over the top 1000 hits.

    Args:
        config: ModelConfig instance with model parameters
        profile_name: Name of the rank profile (default: "fusion")
        base_profile: Name of the BM25 profile to inherit from (default: "bm25")
        embedding_field: Name of the embedding field (default: "embedding")
        query_tensor: Name of the query tensor (default: "q")
        fusion_method: Fusion method - "rrf" for reciprocal rank fusion or "normalize" for linear normalization

    Returns:
        RankProfile: A Vespa RankProfile configured for hybrid search

    Raises:
        ValueError: If fusion_method is neither "rrf" nor "normalize"

    Example:
        >>> config = ModelConfig(model_id="e5-small-v2", embedding_dim=384)
        >>> profile = create_hybrid_rank_profile(config)
        >>> profile.name
        'fusion'
    """
    # closeness() is 1 / (1 + distance) for whichever metric the field uses,
    # so it is a "higher is better" score for both binarized and float models.
    similarity_expr = f"closeness(field, {embedding_field})"

    # Choose global phase expression based on fusion method
    if fusion_method == "rrf":
        global_expr = (
            f"reciprocal_rank_fusion(bm25text, closeness(field, {embedding_field}))"
        )
    elif fusion_method == "normalize":
        # Use linear normalization
        global_expr = (
            f"normalize_linear(bm25text) + normalize_linear({similarity_expr})"
        )
    else:
        raise ValueError(
            f"Unknown fusion_method: {fusion_method}. Use 'rrf' or 'normalize'"
        )

    return RankProfile(
        name=profile_name,
        inherits=base_profile,
        inputs=[(f"query({query_tensor})", _embedding_tensor_type(config))],
        functions=[Function(name="similarity", expression=similarity_expr)],
        first_phase="similarity",
        global_phase=GlobalPhaseRanking(
            expression=global_expr,
            rerank_count=1000,
        ),
        match_features=["similarity", "bm25text"],
    )


# Predefined model configurations for Vespa Cloud models
# Based on https://cloud.vespa.ai/en/model-hub
COMMON_MODELS: Dict[str, ModelConfig] = {
    # Huggingface Embedder models
    "nomic-ai-modernbert": ModelConfig(
        model_id="nomic-ai-modernbert",
        embedding_dim=768,
        binarized=False,
        transformer_output="token_embeddings",
        max_tokens=8192,
        query_prepend="search_query: ",
        document_prepend="search_document: ",
    ),
    "lightonai-modernbert-large": ModelConfig(
        model_id="lightonai-modernbert-large",
        embedding_dim=1024,
        binarized=False,
        max_tokens=8192,
        query_prepend="search_query: ",
        document_prepend="search_document: ",
    ),
    "alibaba-gte-modernbert": ModelConfig(
        model_id="alibaba-gte-modernbert",
        embedding_dim=768,
        binarized=False,
        max_tokens=8192,
        pooling_strategy="cls",
    ),
    "e5-small-v2": ModelConfig(
        model_id="e5-small-v2",
        embedding_dim=384,
        binarized=False,
        max_tokens=512,
        query_prepend="query: ",
        document_prepend="passage: ",
    ),
    "e5-base-v2": ModelConfig(
        model_id="e5-base-v2",
        embedding_dim=768,
        binarized=False,
        max_tokens=512,
        query_prepend="query: ",
        document_prepend="passage: ",
    ),
    "e5-large-v2": ModelConfig(
        model_id="e5-large-v2",
        embedding_dim=1024,
        binarized=False,
        max_tokens=512,
        query_prepend="query: ",
        document_prepend="passage: ",
    ),
    "multilingual-e5-base": ModelConfig(
        model_id="multilingual-e5-base",
        embedding_dim=768,
        binarized=False,
        max_tokens=512,
        query_prepend="query: ",
        document_prepend="passage: ",
    ),
}


def get_model_config(model_name: str) -> ModelConfig:
    """
    Get a predefined model configuration by name.

    A copy of the registry entry is returned, so callers may adjust the result
    (e.g. set ``binarized``) without altering COMMON_MODELS for everyone else.

    Args:
        model_name: Name of a predefined model

    Returns:
        ModelConfig: The model configuration

    Raises:
        KeyError: If the model name is not found

    Example:
        >>> config = get_model_config("e5-small-v2")
        >>> config.embedding_dim
        384
    """
    if model_name not in COMMON_MODELS:
        available = ", ".join(COMMON_MODELS.keys())
        raise KeyError(f"Unknown model '{model_name}'. Available models: {available}")
    return replace(COMMON_MODELS[model_name])


def create_evaluation_package(
    models: Union[str, ModelConfig, List[Union[str, ModelConfig]]],
    app_name: str = "nanobeir_eval",
    schema_name: str = "doc",
) -> ApplicationPackage:
    """
    Create a Vespa application package configured for NanoBEIR evaluation.

    This function creates a complete Vespa application package with all necessary
    components, fields, and rank profiles for evaluation. It supports single or
    multiple embedding models; with multiple models, field, query tensor and
    profile names are suffixed with each model's component_id.

    Args:
        models: Single model or list of models to configure. Each can be:
            - A string model name (e.g., "e5-small-v2") to use a predefined config
            - A ModelConfig instance for custom configuration
        app_name: Name of the application (default: "nanobeir_eval")
        schema_name: Name of the schema (default: "doc")

    Returns:
        ApplicationPackage: Configured Vespa application package with:
            - Components for each embedding model
            - Embedding fields for each model (named "embedding" for single model,
              "embedding_{component_id}" for multiple models)
            - BM25 and semantic rank profiles for each model
            - Hybrid rank profiles (RRF and normalize fusion) for each model
            - A match-only rank profile for baseline evaluation

    Raises:
        ValueError: If models list is empty, or if two models resolve to the
            same component_id (their generated names would collide)
        KeyError: If a model name is not found in COMMON_MODELS

    Example:
        >>> # Single model by name
        >>> package = create_evaluation_package("e5-small-v2")
        >>> len(package.components)
        1
        >>> package.schema.document.fields[2].name
        'embedding'

        >>> # Single model with custom config
        >>> config = ModelConfig(model_id="my-model", embedding_dim=512)
        >>> package = create_evaluation_package(config)
        >>> package.schema.document.fields[2].name
        'embedding'

        >>> # Multiple models - creates separate fields and profiles for each
        >>> package = create_evaluation_package(["e5-small-v2", "e5-base-v2"])
        >>> len(package.components)
        2
        >>> # Fields will be named: embedding_e5_small_v2, embedding_e5_base_v2
        >>> field_names = [f.name for f in package.schema.document.fields if f.name.startswith('embedding')]
        >>> len(field_names)
        2

        >>> # Multiple models with mixed configs
        >>> custom = ModelConfig(model_id="custom-model", embedding_dim=384)
        >>> package = create_evaluation_package(["e5-small-v2", custom])
        >>> len(package.components)
        2
    """
    # Normalize input to a list
    if isinstance(models, (str, ModelConfig)):
        model_configs = [models]
    else:
        model_configs = list(models)

    if not model_configs:
        raise ValueError("At least one model must be provided")

    # Convert string model names to ModelConfig objects
    resolved_configs = [
        get_model_config(model) if isinstance(model, str) else model
        for model in model_configs
    ]

    # Component IDs name the components and (for multi-model) the fields and
    # profiles, so duplicates would silently collide in the generated package.
    seen_component_ids = set()
    for config in resolved_configs:
        if config.component_id in seen_component_ids:
            raise ValueError(
                f"Duplicate component_id '{config.component_id}'. "
                "Give each model a distinct component_id"
            )
        seen_component_ids.add(config.component_id)

    # Determine if we have multiple models (affects naming)
    is_multi_model = len(resolved_configs) > 1

    all_components = []
    all_embedding_fields = []
    all_rank_profiles = []
    # Query inputs of every model, declared on the shared match-only profile
    match_only_inputs = []

    for config in resolved_configs:
        # Create unique identifiers for multi-model setup
        profile_suffix = f"_{config.component_id}" if is_multi_model else ""
        embedding_field_name = f"embedding{profile_suffix}"
        query_tensor = f"q{profile_suffix}"

        # Create the embedder component
        all_components.append(create_embedder_component(config))

        # Create the embedding field with correct type and indexing
        embedding_field = create_embedding_field(
            config, field_name=embedding_field_name, embedder_id=config.component_id
        )
        all_embedding_fields.append(embedding_field)

        query_input = (f"query({query_tensor})", embedding_field.type)
        match_only_inputs.append(query_input)

        # Create base BM25 rank profile (the hybrid profiles inherit from it)
        all_rank_profiles.append(
            RankProfile(
                name=f"bm25{profile_suffix}",
                inputs=[query_input],
                functions=[Function(name="bm25text", expression="bm25(text)")],
                first_phase="bm25text",
                match_features=["bm25text"],
            )
        )

        # Create semantic search rank profile
        all_rank_profiles.append(
            create_semantic_rank_profile(
                config,
                profile_name=f"semantic{profile_suffix}",
                embedding_field=embedding_field_name,
                query_tensor=query_tensor,
            )
        )

        # Create hybrid rank profiles.
        # NOTE: despite its name, "atan_norm" uses normalize_linear
        # (fusion_method="normalize"); the profile name is kept unchanged so
        # existing queries referencing it keep working.
        for hybrid_name, fusion_method in (("fusion", "rrf"), ("atan_norm", "normalize")):
            all_rank_profiles.append(
                create_hybrid_rank_profile(
                    config,
                    profile_name=f"{hybrid_name}{profile_suffix}",
                    base_profile=f"bm25{profile_suffix}",
                    embedding_field=embedding_field_name,
                    query_tensor=query_tensor,
                    fusion_method=fusion_method,
                )
            )

    # Create a match-only profile with inputs for all models
    match_only_profile = RankProfile(
        name="match-only",
        inputs=match_only_inputs,
        first_phase="random",
    )

    # Build the schema with all fields
    schema = Schema(
        name=schema_name,
        document=Document(
            fields=[
                Field(
                    name="id",
                    type="string",
                    indexing=["summary", "attribute"],
                ),
                Field(
                    name="text",
                    type="string",
                    indexing=["index", "summary"],
                    index="enable-bm25",
                    bolding=True,
                ),
            ]
            + all_embedding_fields
        ),
        fieldsets=[FieldSet(name="default", fields=["text"])],
        rank_profiles=[match_only_profile] + all_rank_profiles,
    )

    # Create the application package
    return ApplicationPackage(
        name=app_name,
        schema=[schema],
        components=all_components,
    )