Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
152 changes: 152 additions & 0 deletions data/scripts/approach1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
import os
import pickle
import tempfile
from pathlib import Path
import pandas as pd
from rapidfuzz import process
from tqdm import tqdm
import signal

# Input datasets: recipe corpus (3A2M) and food-image metadata (MM-Food-100K).
RECIPES_CSV = "3A2M.csv"
FOOD_CSV = "MM-Food-100K.csv"

# Checkpoint / output paths and tuning knobs for the resumable matching run.
MATCH_CHECKPOINT = "matches_checkpoint.pkl"  # pickled list of (title, match, score)
PROGRESS_STATE = "progress_state.pkl"        # pickled {"last_idx": int} resume cursor
FINAL_CSV = "final_df.csv"                   # merged output table
SAVE_EVERY = 500                             # checkpoint frequency
ATOMIC_TMP = ".tmp_save"                     # suffix appended to temp files for atomic writes
SCORE_CUTOFF = 95                            # rapidfuzz minimum match score (0-100)

def atomic_save(obj, path: Path):
    """Pickle *obj* to *path* atomically.

    Writes to a sibling temp file first, then swaps it into place with
    os.replace, so an interrupt can never leave a half-written checkpoint.
    """
    staging = path.with_suffix(path.suffix + ATOMIC_TMP)
    with staging.open("wb") as fh:
        pickle.dump(obj, fh)
    os.replace(staging, path)

def atomic_save_csv(df: pd.DataFrame, path: Path):
    """Write *df* to *path* as CSV atomically (temp file + os.replace)."""
    staging = path.with_suffix(path.suffix + ATOMIC_TMP)
    df.to_csv(staging, index=False)
    os.replace(staging, path)

def load_pickle_if_exists(path: Path, default):
    """Unpickle and return the contents of *path*, or *default* when absent."""
    if not path.exists():
        return default
    with path.open("rb") as fh:
        return pickle.load(fh)

# Cooperative-shutdown flag: both processing loops below poll this and
# checkpoint their progress before exiting, instead of dying mid-write.
shutdown = False
def _handle_signal(sig, frame):
    # SIGINT (Ctrl+C) handler: request a graceful stop on the next iteration.
    global shutdown
    shutdown = True
signal.signal(signal.SIGINT, _handle_signal)

# ---- Load datasets and resume state ------------------------------------
recipes_df = pd.read_csv(RECIPES_CSV)
food_df = pd.read_csv(FOOD_CSV)

# No sampling is applied; the *_sample aliases exist so a subset could be
# swapped in here without touching the code below.
recipes_sample = recipes_df
food_sample = food_df

print("Recipes dataset:", recipes_sample.shape)
print("Food dataset:", food_sample.shape)

# Candidate pool for fuzzy matching: unique dish names, normalized to
# stripped lowercase to match the title normalization in the loop below.
food_names = food_sample['dish_name'].dropna().unique()
food_names = [str(n).strip().lower() for n in food_names]

matches_path = Path(MATCH_CHECKPOINT)
state_path = Path(PROGRESS_STATE)
final_path = Path(FINAL_CSV)

# Resume state: accumulated (title, match, score) tuples plus the index of
# the next title to process.
matches = load_pickle_if_exists(matches_path, default=[])
state = load_pickle_if_exists(state_path, default={"last_idx": 0})

start_idx = state.get("last_idx", 0)
print(f"Resuming matching from index {start_idx}. Already found {len(matches)} matches.")

# Unique titles preserve first-seen order, so start_idx stays meaningful
# across runs as long as the input CSV is unchanged.
titles = recipes_sample['title'].dropna().unique()
total_titles = len(titles)

# Fuzzy-match each recipe title against the food names, checkpointing so the
# run can resume after an interrupt (Ctrl+C sets `shutdown` via the handler).
for offset, raw_title in enumerate(titles[start_idx:], start=start_idx):
    if shutdown:
        print("Shutdown requested; saving state and exiting matching loop.")
        state['last_idx'] = offset
        atomic_save(state, state_path)
        atomic_save(matches, matches_path)
        break

    title = str(raw_title).strip().lower()
    match = process.extractOne(title, food_names, score_cutoff=SCORE_CUTOFF)
    if match is not None:
        best_match, score, _ = match
        matches.append((raw_title, best_match, score))

    # Checkpoint every SAVE_EVERY titles *processed*. The original keyed this
    # on len(matches) % SAVE_EVERY inside the match branch, which (a) never
    # advanced last_idx across match-less stretches and (b) re-saved on every
    # single match once the count sat at a multiple of SAVE_EVERY.
    if (offset + 1) % SAVE_EVERY == 0:
        print(f"[Checkpoint] Processed {offset + 1}/{total_titles} titles; {len(matches)} matches. Saving...")
        state['last_idx'] = offset + 1
        atomic_save(matches, matches_path)
        atomic_save(state, state_path)

else:
    # Loop ran to completion (no break): mark the whole range as done.
    state['last_idx'] = total_titles
    atomic_save(matches, matches_path)
    atomic_save(state, state_path)
    print("Completed matching loop. Saved final matches checkpoint.")

# Reload the matches checkpoint (covers the resume path where the matching
# loop above had nothing left to do) and build the merged output table.
matches = load_pickle_if_exists(matches_path, default=matches)
print(f"Total matches to process into final dataset: {len(matches)}")

# Resume the CSV build from however many rows a previous run already wrote.
if final_path.exists():
    existing_df = pd.read_csv(final_path)
    processed_count = len(existing_df)
    final_data = existing_df.to_dict(orient="records")
    print(f"Found existing final CSV with {processed_count} rows. Resuming from there.")
else:
    final_data = []
    processed_count = 0

for idx, (title, dish_name_lower, score) in enumerate(tqdm(matches, desc="Building final dataset"), start=0):
    if idx < processed_count:
        continue  # row already present in the resumed CSV

    # Look up source rows by normalized name; first hit wins. An empty dict
    # stands in when nothing matches, so the .get() calls below yield None.
    rec_rows = recipes_sample[recipes_sample['title'].str.strip().str.lower() == str(title).strip().lower()]
    recipe_row = rec_rows.iloc[0] if not rec_rows.empty else {}

    food_rows = food_sample[food_sample['dish_name'].str.strip().str.lower() == dish_name_lower]
    food_row = food_rows.iloc[0] if not food_rows.empty else {}

    raw_url = food_row.get('image_url', None) if isinstance(food_row, pd.Series) else None
    # Bug fix: a missing image_url comes back from pandas as NaN, which is a
    # *truthy* float, so the original `if raw_url` guard let it through and
    # crashed on .replace(). Only strip the CDN prefix from actual strings.
    image_file = raw_url.replace("https://file.b18a.io/", "") if isinstance(raw_url, str) else None

    final_data.append({
        "dish_name(Recipe)": title,
        "dish_name(MMFood)": dish_name_lower,
        "file_path": image_file,
        "recipe": recipe_row.get('directions', None) if isinstance(recipe_row, pd.Series) else None,
        "ingredients(Recipe)": recipe_row.get('NER', None) if isinstance(recipe_row, pd.Series) else None,
        "ingredients(MMFood)": food_row.get('ingredients', None) if isinstance(food_row, pd.Series) else None,
        "nutritional_profile": food_row.get('nutritional_profile', None) if isinstance(food_row, pd.Series) else None,
        "cooking_method": food_row.get('cooking_method', None) if isinstance(food_row, pd.Series) else None,
        "food_type": food_row.get('food_type', None) if isinstance(food_row, pd.Series) else None,
        "genre": recipe_row.get('genre', None) if isinstance(recipe_row, pd.Series) else None,
        "score": score
    })

    # Periodic atomic checkpoint of the partial CSV.
    if (idx + 1) % SAVE_EVERY == 0:
        print(f"[Final CSV checkpoint] Saving {len(final_data)} rows to {FINAL_CSV}")
        atomic_save_csv(pd.DataFrame(final_data), final_path)

    # Honor Ctrl+C between rows: persist what we have and stop.
    if shutdown:
        print("Shutdown requested; saving final CSV and exiting.")
        atomic_save_csv(pd.DataFrame(final_data), final_path)
        break

df_final = pd.DataFrame(final_data)
atomic_save_csv(df_final, final_path)
print(f"Final dataset saved to {FINAL_CSV} ({len(df_final)} rows).")
166 changes: 166 additions & 0 deletions data/scripts/embd_approach.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
import os
import pickle
import warnings

import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

warnings.filterwarnings('ignore')

def create_faiss_food_merger():
    """Embed both datasets' dish names, index the MM-Food side with FAISS,
    and emit a merged CSV pairing every 3A2M title with its nearest MM-Food dish.

    Side effects: writes 'mm_food_index.faiss', 'a2m_index.faiss', the two
    '*_metadata.pkl' files, 'merged_food_dataset_faiss.csv', and — when any
    pair clears the similarity threshold — 'high_similarity_matches.csv'.

    Returns:
        pd.DataFrame: one row per 3A2M title, prefixed columns from both
        sources plus 'similarity_score' and 'above_threshold'.
    """
    print("Loading CSV files...")
    mm_food_df = pd.read_csv('MM-Food-100K.csv')
    a2m_df = pd.read_csv('3A2M.csv')

    print(f"MM Food dataset shape: {mm_food_df.shape}")
    print(f"3A2M dataset shape: {a2m_df.shape}")

    mm_food_names = mm_food_df['dish_name'].fillna('').astype(str).tolist()
    a2m_names = a2m_df['title'].fillna('').astype(str).tolist()

    print("Loading embedding model...")
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embedding_dim = 384  # all-MiniLM-L6-v2 output dimensionality

    # Perf fix: encode the whole list in one call instead of one
    # model.encode() per name — SentenceTransformer batches internally,
    # which is dramatically faster and yields identical vectors.
    print("Creating embeddings for MM Food dishes...")
    mm_embeddings = model.encode(mm_food_names, show_progress_bar=True)
    mm_embeddings = np.asarray(mm_embeddings).astype('float32')
    # With L2-normalized vectors, inner product == cosine similarity.
    faiss.normalize_L2(mm_embeddings)

    print("Building FAISS index for MM Food...")
    mm_index = faiss.IndexFlatIP(embedding_dim)
    mm_index.add(mm_embeddings)

    print("Saving MM Food FAISS index...")
    faiss.write_index(mm_index, 'mm_food_index.faiss')
    mm_metadata = {
        'names': mm_food_names,
        'dataframe_indices': list(range(len(mm_food_names)))
    }
    with open('mm_food_metadata.pkl', 'wb') as f:
        pickle.dump(mm_metadata, f)

    print("Creating embeddings for 3A2M titles...")
    a2m_embeddings = model.encode(a2m_names, show_progress_bar=True)
    a2m_embeddings = np.asarray(a2m_embeddings).astype('float32')
    faiss.normalize_L2(a2m_embeddings)

    print("Building FAISS index for 3A2M...")
    a2m_index = faiss.IndexFlatIP(embedding_dim)
    a2m_index.add(a2m_embeddings)

    print("Saving 3A2M FAISS index...")
    faiss.write_index(a2m_index, 'a2m_index.faiss')
    a2m_metadata = {
        'names': a2m_names,
        'dataframe_indices': list(range(len(a2m_names)))
    }
    with open('a2m_metadata.pkl', 'wb') as f:
        pickle.dump(a2m_metadata, f)

    print("Finding closest matches for each 3A2M item...")
    final_rows = []
    threshold = 0.99  # cosine-similarity cutoff for "high similarity"
    matches_above_threshold = 0

    batch_size = 100
    for i in tqdm(range(0, len(a2m_embeddings), batch_size), desc="Processing matches"):
        batch_end = min(i + batch_size, len(a2m_embeddings))
        batch_embeddings = a2m_embeddings[i:batch_end]

        # k=1 nearest MM-Food neighbour for each title in the batch.
        similarities, indices = mm_index.search(batch_embeddings, 1)

        for j, (similarity, mm_idx) in enumerate(zip(similarities, indices)):
            original_a2m_idx = i + j
            similarity_score = float(similarity[0])
            mm_match_idx = int(mm_idx[0])

            if similarity_score >= threshold:
                matches_above_threshold += 1

            a2m_row = a2m_df.iloc[original_a2m_idx].to_dict()
            mm_row = mm_food_df.iloc[mm_match_idx].to_dict()

            # Prefix columns so the two sources never collide.
            merged_row = {}
            for col, val in a2m_row.items():
                merged_row[f'3A2M_{col}'] = val
            for col, val in mm_row.items():
                merged_row[f'MM_Food_{col}'] = val

            merged_row['similarity_score'] = similarity_score
            merged_row['above_threshold'] = similarity_score >= threshold

            final_rows.append(merged_row)

    print(f"Total matches processed: {len(final_rows)}")
    print(f"Matches above threshold {threshold}: {matches_above_threshold}")

    final_df = pd.DataFrame(final_rows)

    print(f"Final dataset shape: {final_df.shape}")
    print(f"Columns in final dataset: {len(final_df.columns)}")

    final_df.to_csv('merged_food_dataset_faiss.csv', index=False)
    print("Saved complete merged dataset to 'merged_food_dataset_faiss.csv'")

    high_similarity_df = final_df[final_df['above_threshold'] == True]
    if len(high_similarity_df) > 0:
        high_similarity_df.to_csv('high_similarity_matches.csv', index=False)
        print(f"Saved {len(high_similarity_df)} high similarity matches to 'high_similarity_matches.csv'")

    return final_df

def search_food_similarity(query_food_name, dataset='mm_food', top_k=1):
    """Query a previously built FAISS index for dishes similar to *query_food_name*.

    Args:
        query_food_name: free-text dish name to embed and search with.
        dataset: 'mm_food' searches the MM-Food index; anything else the 3A2M one.
        top_k: number of nearest neighbours to return.

    Returns:
        List of {'food_name', 'similarity', 'index'} dicts (top_k entries),
        or None when the index has not been built yet.
    """
    if dataset == 'mm_food':
        index_file = 'mm_food_index.faiss'
        metadata_file = 'mm_food_metadata.pkl'
    else:
        index_file = 'a2m_index.faiss'
        metadata_file = 'a2m_metadata.pkl'

    # Bug fix: `os` was referenced here without ever being imported in this
    # module (NameError at call time); it is now imported at the top of the
    # file. Also guard the metadata pickle, not just the index.
    if not os.path.exists(index_file) or not os.path.exists(metadata_file):
        print(f"Index file {index_file} not found. Run create_faiss_food_merger() first.")
        return None

    model = SentenceTransformer('all-MiniLM-L6-v2')
    index = faiss.read_index(index_file)

    with open(metadata_file, 'rb') as f:
        metadata = pickle.load(f)

    # Normalize the query the same way the index vectors were normalized so
    # inner-product scores are cosine similarities.
    query_embedding = model.encode(query_food_name).astype('float32').reshape(1, -1)
    faiss.normalize_L2(query_embedding)

    similarities, indices = index.search(query_embedding, top_k)

    results = []
    for sim, idx in zip(similarities[0], indices[0]):
        results.append({
            'food_name': metadata['names'][idx],
            'similarity': float(sim),
            'index': int(idx)
        })

    return results

if __name__ == "__main__":
    # Build the indexes and the merged dataset, then print a quick summary.
    merged_df = create_faiss_food_merger()

    print("\nSample of merged data:")
    if len(merged_df) > 0:
        print(merged_df[['3A2M_title', 'MM_Food_dish_name', 'similarity_score', 'above_threshold']].head())

    print(f"\nTotal rows in final dataset: {len(merged_df)}")
    print(f"High similarity matches (>= 0.99): {len(merged_df[merged_df['above_threshold'] == True])}")
    print(f"Average similarity score: {merged_df['similarity_score'].mean():.4f}")
Loading