-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathembedding_helper.py
More file actions
77 lines (61 loc) · 2.28 KB
/
Copy pathembedding_helper.py
File metadata and controls
77 lines (61 loc) · 2.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import open_clip
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from config import CLIP_PATH
# E5 state: tokenizer (`tok`) and encoder (`model`).
# Both stay None until initialize_e5() assigns them.
tok = model = None

# CLIP state: target device string, text/image model and tokenizer.
# All stay None until initialize_clip() assigns them.
device = model_clip = tokenizer = None
def initialize_embedding_model(embed_type):
    """Load the embedding backend selected by ``embed_type``.

    ``"E5"`` triggers ``initialize_e5()`` and ``"CLIP"`` triggers
    ``initialize_clip()``. Any other value is ignored: nothing is loaded
    and no error is raised.
    """
    if embed_type == "E5":
        initialize_e5()
        return
    if embed_type == "CLIP":
        initialize_clip()
def compute_embeddings(text, embed_type):
    """Compute embeddings for a batch of input texts.

    Args:
        text: Batch of input strings; forwarded unchanged to the selected
            backend function.
        embed_type: Backend selector, either ``"E5"`` or ``"CLIP"``.

    Returns:
        The result of ``compute_e5_embeddings`` or
        ``compute_clip_embeddings`` for the given batch.

    Raises:
        ValueError: If ``embed_type`` is not one of the supported names.
    """
    if embed_type == "E5":
        return compute_e5_embeddings(text)
    if embed_type == "CLIP":
        return compute_clip_embeddings(text)
    # An unknown selector used to fall through to an unbound local variable
    # and surface as an UnboundLocalError; fail with a clear message instead.
    raise ValueError(
        f"Unsupported embed_type: {embed_type!r} (expected 'E5' or 'CLIP')"
    )
def initialize_e5() -> None:
    """Populate the module-level E5 tokenizer (``tok``) and model (``model``).

    Both objects are loaded via ``from_pretrained`` from the relative
    directory ``./models/e5``, so the lookup depends on the process's
    current working directory.
    """
    global tok, model
    e5_dir = "./models/e5"
    tok = AutoTokenizer.from_pretrained(e5_dir)
    model = AutoModel.from_pretrained(e5_dir)
def compute_e5_embeddings(texts: list[str]) -> np.ndarray:
    """Embed texts with E5 using masked mean pooling and L2 normalization.

    Each text is prefixed with ``"passage: "`` before tokenization. The
    token states are summed over positions kept by the attention mask,
    divided by the number of kept positions, and scaled to unit L2 norm so
    that dot products equal cosine similarity.

    Requires ``initialize_e5()`` to have populated ``tok`` and ``model``.

    Returns:
        Array with one row per input text.
        NOTE(review): the original comment gives the shape as (N, 768); the
        width depends on the checkpoint in ./models/e5 -- confirm.
    """
    # E5 expects a "query: ..." or "passage: ..." prefix; inputs are
    # always treated as passages here.
    prefixed = [f"passage: {t}" for t in texts]
    encoded = tok(prefixed, padding=True, truncation=True, return_tensors="pt")
    mask = encoded["attention_mask"]
    with torch.no_grad():
        hidden = model(**encoded).last_hidden_state
        # Masked positions contribute zero to the sum; the divisor counts
        # only unmasked positions, giving a mean over real tokens.
        summed = (hidden * mask.unsqueeze(-1)).sum(1)
        pooled = summed / mask.sum(1, keepdim=True)
        unit = torch.nn.functional.normalize(pooled, p=2, dim=1)
    return unit.cpu().numpy()
def initialize_clip() -> None:
    """Load the local OpenCLIP model and tokenizer for inference.

    Populates three module-level globals:

    * ``device``: ``"cuda"`` when CUDA is available, otherwise ``"cpu"``.
    * ``model_clip``: the ``ViT-L-14`` model created with
      ``pretrained=CLIP_PATH``, moved to ``device`` and put in eval mode.
    * ``tokenizer``: the OpenCLIP tokenizer for ``ViT-L-14``.
    """
    global device, model_clip, tokenizer
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # The returned image transforms are discarded: this module only calls
    # encode_text on the model.
    clip_model, _, _ = open_clip.create_model_and_transforms(
        "ViT-L-14",
        pretrained=CLIP_PATH,
    )
    # The OpenCLIP README notes that models are in train mode by default,
    # so switch to eval mode explicitly before using it for inference.
    model_clip = clip_model.to(device).eval()
    tokenizer = open_clip.get_tokenizer("ViT-L-14")
def compute_clip_embeddings(texts: list[str]) -> np.ndarray:
    """Embed texts with the CLIP text encoder and L2-normalize each vector.

    Requires ``initialize_clip()`` to have populated ``tokenizer``,
    ``model_clip`` and ``device``.

    Returns:
        Array with one unit-norm row per input text; the original comment
        gives the shape as (N, 768).
    """
    token_ids = tokenizer(texts)
    token_ids = token_ids.to(device)
    with torch.no_grad():
        features = model_clip.encode_text(token_ids)
        # Unit-length rows along the last (feature) dimension.
        features = torch.nn.functional.normalize(features, dim=-1)
    return features.cpu().numpy()