-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathautoEmbedReRank.py
More file actions
156 lines (137 loc) · 6.04 KB
/
autoEmbedReRank.py
File metadata and controls
156 lines (137 loc) · 6.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
# rerank_with_auto_embeddings.py
# NOTE: a Voyage API key is required by this script, both to embed the query text (vo.embed) and to call the reranker (vo.rerank).
import textwrap, math
from collections import defaultdict
from pymongo import MongoClient
import voyageai
# ==== CONFIG ====
# The two values below are placeholders; replace them with real credentials
# before running (the clients further down are built from them at import time).
VOYAGE_API_KEY = "YOUR_VOYAGE_KEY"
MONGODB_URI = "YOUR_MONGODB_ATLAS_URI"
DB_NAME = "vector_demo"
COLL_NAME = "docs" # collection where auto-embedded docs live
EMBED_PATH = "embedding" # field the index uses for vectors
INDEX_NAME = "vs_index" # your Atlas Vector Search index name
NUM_CANDIDATES = 60 # ANN candidate pool
TOP_K = 6 # how many docs to show/rerank
# ==== CLIENTS ====
# Module-level clients shared by every function in this file.
vo = voyageai.Client(api_key=VOYAGE_API_KEY)
mongo = MongoClient(MONGODB_URI)
coll = mongo[DB_NAME][COLL_NAME]
# ==== QUERIES ====
# Baseline question, with no steering instruction.
no_instruction_query = "What tool is best for remote collaboration?"
# Same question, prefixed with an instruction sentence that narrows the intent.
instruction_query = (
"Focus on video conferencing and real-time meetings. "
"What tool is best for remote collaboration?"
)
# Sub-intent queries; run_demo searches and reranks each one, then fuses the rankings.
intent_expansions = [
"Best tool for high-quality video conferencing and webinars",
"Real-time meetings with large groups",
"Stable video calls for distributed teams",
"Screen sharing and recording for live sessions",
]
# ==== Vector Search (Stage 1) ====
def embed_query_vec(q: str, model="voyage-3", input_type="query"):
    """Embed a single query string with Voyage and return its vector.

    Args:
        q: The query text to embed.
        model: Voyage embedding model name.
        input_type: Passed through to the Voyage client. Defaults to "query",
            which Voyage recommends for the search side of retrieval; pass
            None to send the text without an input-type hint.

    Returns:
        The first (and only) embedding in the response.
    """
    # The Python client takes its inputs as `texts` (a list of strings);
    # the previous `input=` keyword is not a parameter of Client.embed.
    # NOTE(review): an earlier comment claimed voyage-context-3 also works
    # here -- confirm that model is served by this call before switching.
    resp = vo.embed(texts=[q], model=model, input_type=input_type)
    return resp.embeddings[0]
def atlas_vector_search(query_text: str, top_k=TOP_K, num_candidates=NUM_CANDIDATES):
    """Run a vector search for `query_text` against the configured collection.

    The query is embedded with embed_query_vec, then sent through a two-stage
    aggregation: a $vectorSearch stage followed by a $project stage that keeps
    only the `text` field and the search score.

    Args:
        query_text: Natural-language query to search for.
        top_k: Value used for the stage's `limit`.
        num_candidates: Value used for the stage's `numCandidates`.

    Returns:
        A list of dicts as produced by the pipeline ([{text, score}, ...]).
    """
    query_vector = embed_query_vec(query_text)

    search_stage = {
        "$vectorSearch": {
            "index": INDEX_NAME,
            "path": EMBED_PATH,
            "queryVector": query_vector,
            "numCandidates": num_candidates,
            "limit": top_k,
        }
    }
    # Drop _id and expose the similarity score alongside the text.
    project_stage = {
        "$project": {
            "_id": 0,
            "text": 1,
            "score": {"$meta": "vectorSearchScore"},
        }
    }

    cursor = coll.aggregate([search_stage, project_stage])
    return list(cursor)
# ==== Rerank (Stage 2) ====
def run_rerank(query, docs, model="rerank-2.5", top_k=TOP_K):
    """Rerank `docs` against `query` with the Voyage reranker.

    Args:
        query: The query string.
        docs: Candidate documents, as a list of strings.
        model: Reranker model name.
        top_k: Forwarded to the reranker as `top_k`.

    Returns:
        The `results` attribute of the rerank response; each item carries
        .document, .index (position in `docs`) and .relevance_score.
    """
    rerank_args = {
        "query": query,
        "documents": docs,
        "model": model,
        "top_k": top_k,
    }
    return vo.rerank(**rerank_args).results
# ==== Display helpers ====
def ascii_bar(score, width=18):
    """Render `score` as a bar of full-block characters.

    The bar length is round(score * width), capped at `width`. When that
    length is zero or negative a single "·" is returned instead of an
    empty string.
    """
    full_block = "▁▂▃▄▅▆▇█"[-1]
    filled = min(width, int(round(score * width)))
    if filled <= 0:
        return "·"
    return full_block * filled
def show_table(title, results, baseline_docs=None):
    """Print a ranked table of rerank results.

    Args:
        title: Heading printed above the table.
        results: Iterable of result objects. Each must have a `.document`
            string; `.relevance_score` is read if present, else 0.0 is used.
        baseline_docs: Optional list of document strings in baseline order.
            When given, each row whose document appears in it gets a rank
            delta suffix: "(+N↑)" if it moved up N places relative to the
            baseline, "(-N↓)" if it moved down, "(0)" if unchanged.
            Documents absent from the baseline get no suffix.
    """
    print(f"\n=== {title} ===")
    print("Rank Score Doc")
    print("---- ------- -------------------------------------------------------------")
    for i, r in enumerate(results, 1):
        score = getattr(r, "relevance_score", 0.0)
        bar = ascii_bar(score)
        snippet = textwrap.shorten(r.document, width=60, placeholder="…")
        delta = ""
        if baseline_docs is not None:
            try:
                prev = baseline_docs.index(r.document)
            except ValueError:
                # Document was not in the baseline ranking: nothing to compare.
                prev = None
            if prev is not None:
                # Positive means the document sits higher now than in the baseline.
                delta_rank = prev - (i - 1)
                if delta_rank > 0:
                    delta = f" (+{delta_rank}↑)"
                elif delta_rank < 0:
                    # Previously this branch also printed "↑", which
                    # contradicted the negative number.
                    delta = f" ({delta_rank}↓)"
                else:
                    delta = " (0)"
        print(f"{i:<4} {score:<7.4f} {bar} {snippet}{delta}")
def to_rank_list(results):
    """Convert result objects into a list of (index, score) tuples.

    `index` is each result's `.index` attribute (None when missing) and
    `score` is its `.relevance_score` (0.0 when missing). Order is preserved.
    """
    return [
        (getattr(item, "index", None), getattr(item, "relevance_score", 0.0))
        for item in results
    ]
def rrf_fuse(list_of_rank_lists, k=TOP_K, k_rrf=60):
    """Combine several rankings with reciprocal rank fusion.

    Args:
        list_of_rank_lists: Iterable of rankings; each ranking is a sequence
            of (key, score) pairs in rank order. Only the position and the
            key are used; the per-list score is ignored.
        k: Number of fused entries to return.
        k_rrf: Constant added to the 1-based rank in the 1 / (k_rrf + rank)
            contribution.

    Returns:
        Up to `k` lightweight objects, highest fused score first, each with
        `.document` and `.index` set to the key and `.relevance_score` set
        to the fused score.
    """
    class R: pass

    totals = defaultdict(float)
    for ranking in list_of_rank_lists:
        for position, (key, _unused_score) in enumerate(ranking, start=1):
            totals[key] += 1.0 / (k_rrf + position)

    ordered = sorted(totals.items(), key=lambda pair: pair[1], reverse=True)

    fused_results = []
    for key, fused_score in ordered[:k]:
        entry = R()
        entry.document = key  # placeholder; caller will map back to text
        entry.relevance_score = fused_score
        entry.index = key
        fused_results.append(entry)
    return fused_results
# ==== Demo flow ====
def run_demo():
    """Run the demo: vector search, rerank, then sub-intent fusion.

    Steps:
      1. Retrieve candidates from Atlas for the plain and the instructed query.
      2. Rerank each candidate set with Voyage and print both tables; the
         instructed table shows rank deltas against the baseline ordering.
      3. For every sub-intent query, retrieve and rerank, then fuse the
         per-intent rankings with reciprocal rank fusion and print the result.

    Prints to stdout and returns None.
    """
    # 1) Retrieve candidates from Atlas (already auto-embedded)
    base_hits = atlas_vector_search(no_instruction_query)
    inst_hits = atlas_vector_search(instruction_query)
    base_docs = [d["text"] for d in base_hits]
    inst_docs = [d["text"] for d in inst_hits]

    # 2) Rerank those candidates with Voyage
    base_rr = run_rerank(no_instruction_query, base_docs, model="rerank-2.5", top_k=TOP_K)
    inst_rr = run_rerank(instruction_query, inst_docs, model="rerank-2.5", top_k=TOP_K)
    show_table("Atlas Vector Search → rerank-2.5 | Baseline (no instruction)", base_rr)
    show_table(
        "Atlas Vector Search → rerank-2.5 | With instruction",
        inst_rr,
        baseline_docs=[r.document for r in base_rr],
    )

    # 3) Optional: multi-intent fusion over Atlas candidates, then rerank
    expanded_ranklists = []
    for q in intent_expansions:
        hits = atlas_vector_search(q)
        docs = [h["text"] for h in hits]
        rr = run_rerank(q, docs, model="rerank-2.5", top_k=TOP_K)
        # Key each ranking by document text rather than by position in this
        # query's candidate list: position i refers to a different document
        # in every list, so fusing on it mixes unrelated documents.
        # setdefault keeps the best-ranked entry if a text appears twice.
        ranked_by_text = {}
        for r in rr:
            ranked_by_text.setdefault(docs[r.index], r.relevance_score)
        expanded_ranklists.append(list(ranked_by_text.items()))

    fused = rrf_fuse(expanded_ranklists, k=TOP_K)
    # The fusion key is the document text itself, so no index lookup is needed.
    fused = [{
        "document": r.index,
        "relevance_score": r.relevance_score
    } for r in fused]

    print("\n=== RRF Fusion over sub-intents (demo) ===")
    print("Rank Score Doc")
    print("---- ------- -------------------------------------------------------------")
    for i, it in enumerate(fused, 1):
        bar = ascii_bar(it["relevance_score"])
        snippet = textwrap.shorten(it["document"], width=60, placeholder="…")
        print(f"{i:<4} {it['relevance_score']:<7.4f} {bar} {snippet}")
# Run the demo only when this file is executed as a script.
if __name__ == "__main__":
    run_demo()