Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
152 changes: 152 additions & 0 deletions data/scripts/approach1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
import os
import pickle
import tempfile
from pathlib import Path
import pandas as pd
from rapidfuzz import process
from tqdm import tqdm
import signal

# Input datasets: recipe corpus (3A2M) and food-image metadata (MM-Food-100K).
RECIPES_CSV = "3A2M.csv"
FOOD_CSV = "MM-Food-100K.csv"

# Checkpoint / output paths and tuning knobs for the resumable matching run.
MATCH_CHECKPOINT = "matches_checkpoint.pkl"  # pickled list of (title, match, score)
PROGRESS_STATE = "progress_state.pkl"        # pickled {"last_idx": int} resume cursor
FINAL_CSV = "final_df.csv"                   # merged output table
SAVE_EVERY = 500                             # checkpoint frequency
ATOMIC_TMP = ".tmp_save"                     # suffix appended to temp files for atomic writes
SCORE_CUTOFF = 95                            # rapidfuzz minimum match score (0-100)

def atomic_save(obj, path: Path):
    """Pickle *obj* to *path* atomically.

    Writes to a sibling temp file first, then swaps it into place with
    os.replace, so an interrupt can never leave a half-written checkpoint.
    """
    staging = path.with_suffix(path.suffix + ATOMIC_TMP)
    with staging.open("wb") as fh:
        pickle.dump(obj, fh)
    os.replace(staging, path)

def atomic_save_csv(df: pd.DataFrame, path: Path):
    """Write *df* to *path* as CSV atomically (temp file + os.replace)."""
    staging = path.with_suffix(path.suffix + ATOMIC_TMP)
    df.to_csv(staging, index=False)
    os.replace(staging, path)

def load_pickle_if_exists(path: Path, default):
    """Unpickle and return the contents of *path*, or *default* when absent."""
    if not path.exists():
        return default
    with path.open("rb") as fh:
        return pickle.load(fh)

# Cooperative-shutdown flag: both processing loops below poll this and
# checkpoint their progress before exiting, instead of dying mid-write.
shutdown = False
def _handle_signal(sig, frame):
    # SIGINT (Ctrl+C) handler: request a graceful stop on the next iteration.
    global shutdown
    shutdown = True
signal.signal(signal.SIGINT, _handle_signal)

# ---- Load datasets and resume state ------------------------------------
recipes_df = pd.read_csv(RECIPES_CSV)
food_df = pd.read_csv(FOOD_CSV)

# No sampling is applied; the *_sample aliases exist so a subset could be
# swapped in here without touching the code below.
recipes_sample = recipes_df
food_sample = food_df

print("Recipes dataset:", recipes_sample.shape)
print("Food dataset:", food_sample.shape)

# Candidate pool for fuzzy matching: unique dish names, normalized to
# stripped lowercase to match the title normalization in the loop below.
food_names = food_sample['dish_name'].dropna().unique()
food_names = [str(n).strip().lower() for n in food_names]

matches_path = Path(MATCH_CHECKPOINT)
state_path = Path(PROGRESS_STATE)
final_path = Path(FINAL_CSV)

# Resume state: accumulated (title, match, score) tuples plus the index of
# the next title to process.
matches = load_pickle_if_exists(matches_path, default=[])
state = load_pickle_if_exists(state_path, default={"last_idx": 0})

start_idx = state.get("last_idx", 0)
print(f"Resuming matching from index {start_idx}. Already found {len(matches)} matches.")

# Unique titles preserve first-seen order, so start_idx stays meaningful
# across runs as long as the input CSV is unchanged.
titles = recipes_sample['title'].dropna().unique()
total_titles = len(titles)

# Fuzzy-match each recipe title against the food names, checkpointing so the
# run can resume after an interrupt (Ctrl+C sets `shutdown` via the handler).
for offset, raw_title in enumerate(titles[start_idx:], start=start_idx):
    if shutdown:
        print("Shutdown requested; saving state and exiting matching loop.")
        state['last_idx'] = offset
        atomic_save(state, state_path)
        atomic_save(matches, matches_path)
        break

    title = str(raw_title).strip().lower()
    match = process.extractOne(title, food_names, score_cutoff=SCORE_CUTOFF)
    if match is not None:
        best_match, score, _ = match
        matches.append((raw_title, best_match, score))

    # Checkpoint every SAVE_EVERY titles *processed*. The original keyed this
    # on len(matches) % SAVE_EVERY inside the match branch, which (a) never
    # advanced last_idx across match-less stretches and (b) re-saved on every
    # single match once the count sat at a multiple of SAVE_EVERY.
    if (offset + 1) % SAVE_EVERY == 0:
        print(f"[Checkpoint] Processed {offset + 1}/{total_titles} titles; {len(matches)} matches. Saving...")
        state['last_idx'] = offset + 1
        atomic_save(matches, matches_path)
        atomic_save(state, state_path)

else:
    # Loop ran to completion (no break): mark the whole range as done.
    state['last_idx'] = total_titles
    atomic_save(matches, matches_path)
    atomic_save(state, state_path)
    print("Completed matching loop. Saved final matches checkpoint.")

# Reload the matches checkpoint (covers the resume path where the matching
# loop above had nothing left to do) and build the merged output table.
matches = load_pickle_if_exists(matches_path, default=matches)
print(f"Total matches to process into final dataset: {len(matches)}")

# Resume the CSV build from however many rows a previous run already wrote.
if final_path.exists():
    existing_df = pd.read_csv(final_path)
    processed_count = len(existing_df)
    final_data = existing_df.to_dict(orient="records")
    print(f"Found existing final CSV with {processed_count} rows. Resuming from there.")
else:
    final_data = []
    processed_count = 0

for idx, (title, dish_name_lower, score) in enumerate(tqdm(matches, desc="Building final dataset"), start=0):
    if idx < processed_count:
        continue  # row already present in the resumed CSV

    # Look up source rows by normalized name; first hit wins. An empty dict
    # stands in when nothing matches, so the .get() calls below yield None.
    rec_rows = recipes_sample[recipes_sample['title'].str.strip().str.lower() == str(title).strip().lower()]
    recipe_row = rec_rows.iloc[0] if not rec_rows.empty else {}

    food_rows = food_sample[food_sample['dish_name'].str.strip().str.lower() == dish_name_lower]
    food_row = food_rows.iloc[0] if not food_rows.empty else {}

    raw_url = food_row.get('image_url', None) if isinstance(food_row, pd.Series) else None
    # Bug fix: a missing image_url comes back from pandas as NaN, which is a
    # *truthy* float, so the original `if raw_url` guard let it through and
    # crashed on .replace(). Only strip the CDN prefix from actual strings.
    image_file = raw_url.replace("https://file.b18a.io/", "") if isinstance(raw_url, str) else None

    final_data.append({
        "dish_name(Recipe)": title,
        "dish_name(MMFood)": dish_name_lower,
        "file_path": image_file,
        "recipe": recipe_row.get('directions', None) if isinstance(recipe_row, pd.Series) else None,
        "ingredients(Recipe)": recipe_row.get('NER', None) if isinstance(recipe_row, pd.Series) else None,
        "ingredients(MMFood)": food_row.get('ingredients', None) if isinstance(food_row, pd.Series) else None,
        "nutritional_profile": food_row.get('nutritional_profile', None) if isinstance(food_row, pd.Series) else None,
        "cooking_method": food_row.get('cooking_method', None) if isinstance(food_row, pd.Series) else None,
        "food_type": food_row.get('food_type', None) if isinstance(food_row, pd.Series) else None,
        "genre": recipe_row.get('genre', None) if isinstance(recipe_row, pd.Series) else None,
        "score": score
    })

    # Periodic atomic checkpoint of the partial CSV.
    if (idx + 1) % SAVE_EVERY == 0:
        print(f"[Final CSV checkpoint] Saving {len(final_data)} rows to {FINAL_CSV}")
        atomic_save_csv(pd.DataFrame(final_data), final_path)

    # Honor Ctrl+C between rows: persist what we have and stop.
    if shutdown:
        print("Shutdown requested; saving final CSV and exiting.")
        atomic_save_csv(pd.DataFrame(final_data), final_path)
        break

df_final = pd.DataFrame(final_data)
atomic_save_csv(df_final, final_path)
print(f"Final dataset saved to {FINAL_CSV} ({len(df_final)} rows).")
166 changes: 166 additions & 0 deletions data/scripts/embd_approach.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
import os
import pickle
import warnings

import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

warnings.filterwarnings('ignore')

def create_faiss_food_merger():
    """Embed both datasets' dish names, index the MM-Food side with FAISS,
    and emit a merged CSV pairing every 3A2M title with its nearest MM-Food dish.

    Side effects: writes 'mm_food_index.faiss', 'a2m_index.faiss', the two
    '*_metadata.pkl' files, 'merged_food_dataset_faiss.csv', and — when any
    pair clears the similarity threshold — 'high_similarity_matches.csv'.

    Returns:
        pd.DataFrame: one row per 3A2M title, prefixed columns from both
        sources plus 'similarity_score' and 'above_threshold'.
    """
    print("Loading CSV files...")
    mm_food_df = pd.read_csv('MM-Food-100K.csv')
    a2m_df = pd.read_csv('3A2M.csv')

    print(f"MM Food dataset shape: {mm_food_df.shape}")
    print(f"3A2M dataset shape: {a2m_df.shape}")

    mm_food_names = mm_food_df['dish_name'].fillna('').astype(str).tolist()
    a2m_names = a2m_df['title'].fillna('').astype(str).tolist()

    print("Loading embedding model...")
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embedding_dim = 384  # all-MiniLM-L6-v2 output dimensionality

    # Perf fix: encode the whole list in one call instead of one
    # model.encode() per name — SentenceTransformer batches internally,
    # which is dramatically faster and yields identical vectors.
    print("Creating embeddings for MM Food dishes...")
    mm_embeddings = model.encode(mm_food_names, show_progress_bar=True)
    mm_embeddings = np.asarray(mm_embeddings).astype('float32')
    # With L2-normalized vectors, inner product == cosine similarity.
    faiss.normalize_L2(mm_embeddings)

    print("Building FAISS index for MM Food...")
    mm_index = faiss.IndexFlatIP(embedding_dim)
    mm_index.add(mm_embeddings)

    print("Saving MM Food FAISS index...")
    faiss.write_index(mm_index, 'mm_food_index.faiss')
    mm_metadata = {
        'names': mm_food_names,
        'dataframe_indices': list(range(len(mm_food_names)))
    }
    with open('mm_food_metadata.pkl', 'wb') as f:
        pickle.dump(mm_metadata, f)

    print("Creating embeddings for 3A2M titles...")
    a2m_embeddings = model.encode(a2m_names, show_progress_bar=True)
    a2m_embeddings = np.asarray(a2m_embeddings).astype('float32')
    faiss.normalize_L2(a2m_embeddings)

    print("Building FAISS index for 3A2M...")
    a2m_index = faiss.IndexFlatIP(embedding_dim)
    a2m_index.add(a2m_embeddings)

    print("Saving 3A2M FAISS index...")
    faiss.write_index(a2m_index, 'a2m_index.faiss')
    a2m_metadata = {
        'names': a2m_names,
        'dataframe_indices': list(range(len(a2m_names)))
    }
    with open('a2m_metadata.pkl', 'wb') as f:
        pickle.dump(a2m_metadata, f)

    print("Finding closest matches for each 3A2M item...")
    final_rows = []
    threshold = 0.99  # cosine-similarity cutoff for "high similarity"
    matches_above_threshold = 0

    batch_size = 100
    for i in tqdm(range(0, len(a2m_embeddings), batch_size), desc="Processing matches"):
        batch_end = min(i + batch_size, len(a2m_embeddings))
        batch_embeddings = a2m_embeddings[i:batch_end]

        # k=1 nearest MM-Food neighbour for each title in the batch.
        similarities, indices = mm_index.search(batch_embeddings, 1)

        for j, (similarity, mm_idx) in enumerate(zip(similarities, indices)):
            original_a2m_idx = i + j
            similarity_score = float(similarity[0])
            mm_match_idx = int(mm_idx[0])

            if similarity_score >= threshold:
                matches_above_threshold += 1

            a2m_row = a2m_df.iloc[original_a2m_idx].to_dict()
            mm_row = mm_food_df.iloc[mm_match_idx].to_dict()

            # Prefix columns so the two sources never collide.
            merged_row = {}
            for col, val in a2m_row.items():
                merged_row[f'3A2M_{col}'] = val
            for col, val in mm_row.items():
                merged_row[f'MM_Food_{col}'] = val

            merged_row['similarity_score'] = similarity_score
            merged_row['above_threshold'] = similarity_score >= threshold

            final_rows.append(merged_row)

    print(f"Total matches processed: {len(final_rows)}")
    print(f"Matches above threshold {threshold}: {matches_above_threshold}")

    final_df = pd.DataFrame(final_rows)

    print(f"Final dataset shape: {final_df.shape}")
    print(f"Columns in final dataset: {len(final_df.columns)}")

    final_df.to_csv('merged_food_dataset_faiss.csv', index=False)
    print("Saved complete merged dataset to 'merged_food_dataset_faiss.csv'")

    high_similarity_df = final_df[final_df['above_threshold'] == True]
    if len(high_similarity_df) > 0:
        high_similarity_df.to_csv('high_similarity_matches.csv', index=False)
        print(f"Saved {len(high_similarity_df)} high similarity matches to 'high_similarity_matches.csv'")

    return final_df

def search_food_similarity(query_food_name, dataset='mm_food', top_k=1):
    """Query a previously built FAISS index for dishes similar to *query_food_name*.

    Args:
        query_food_name: free-text dish name to embed and search with.
        dataset: 'mm_food' searches the MM-Food index; anything else the 3A2M one.
        top_k: number of nearest neighbours to return.

    Returns:
        List of {'food_name', 'similarity', 'index'} dicts (top_k entries),
        or None when the index has not been built yet.
    """
    if dataset == 'mm_food':
        index_file = 'mm_food_index.faiss'
        metadata_file = 'mm_food_metadata.pkl'
    else:
        index_file = 'a2m_index.faiss'
        metadata_file = 'a2m_metadata.pkl'

    # Bug fix: `os` was referenced here without ever being imported in this
    # module (NameError at call time); it is now imported at the top of the
    # file. Also guard the metadata pickle, not just the index.
    if not os.path.exists(index_file) or not os.path.exists(metadata_file):
        print(f"Index file {index_file} not found. Run create_faiss_food_merger() first.")
        return None

    model = SentenceTransformer('all-MiniLM-L6-v2')
    index = faiss.read_index(index_file)

    with open(metadata_file, 'rb') as f:
        metadata = pickle.load(f)

    # Normalize the query the same way the index vectors were normalized so
    # inner-product scores are cosine similarities.
    query_embedding = model.encode(query_food_name).astype('float32').reshape(1, -1)
    faiss.normalize_L2(query_embedding)

    similarities, indices = index.search(query_embedding, top_k)

    results = []
    for sim, idx in zip(similarities[0], indices[0]):
        results.append({
            'food_name': metadata['names'][idx],
            'similarity': float(sim),
            'index': int(idx)
        })

    return results

if __name__ == "__main__":
    # Build the indexes and the merged dataset, then print a quick summary.
    merged_df = create_faiss_food_merger()

    print("\nSample of merged data:")
    if len(merged_df) > 0:
        print(merged_df[['3A2M_title', 'MM_Food_dish_name', 'similarity_score', 'above_threshold']].head())

    print(f"\nTotal rows in final dataset: {len(merged_df)}")
    print(f"High similarity matches (>= 0.99): {len(merged_df[merged_df['above_threshold'] == True])}")
    print(f"Average similarity score: {merged_df['similarity_score'].mean():.4f}")
Loading