diff --git a/data/scripts/approach1.py b/data/scripts/approach1.py
new file mode 100644
index 0000000..da79acc
--- /dev/null
+++ b/data/scripts/approach1.py
@@ -0,0 +1,171 @@
+"""Fuzzy-match 3A2M recipe titles against MM-Food-100K dish names,
+with atomic checkpointing so long runs can be interrupted and resumed."""
+import os
+import pickle
+import signal
+from pathlib import Path
+
+import pandas as pd
+from rapidfuzz import process
+from tqdm import tqdm
+
+RECIPES_CSV = "3A2M.csv"
+FOOD_CSV = "MM-Food-100K.csv"
+
+MATCH_CHECKPOINT = "matches_checkpoint.pkl"
+PROGRESS_STATE = "progress_state.pkl"
+FINAL_CSV = "final_df.csv"
+SAVE_EVERY = 500
+ATOMIC_TMP = ".tmp_save"
+SCORE_CUTOFF = 95
+
+
+def atomic_save(obj, path: Path):
+    """Pickle obj to a temp file, then atomically rename it over path."""
+    tmp = path.with_suffix(path.suffix + ATOMIC_TMP)
+    with open(tmp, "wb") as f:
+        pickle.dump(obj, f)
+    os.replace(tmp, path)
+
+
+def atomic_save_csv(df: pd.DataFrame, path: Path):
+    """Write df as CSV via a temp file and an atomic rename."""
+    tmp = path.with_suffix(path.suffix + ATOMIC_TMP)
+    df.to_csv(tmp, index=False)
+    os.replace(tmp, path)
+
+
+def load_pickle_if_exists(path: Path, default):
+    """Return the unpickled contents of path if it exists, else default."""
+    if path.exists():
+        with open(path, "rb") as f:
+            return pickle.load(f)
+    return default
+
+
+# Cooperative shutdown: SIGINT sets a flag so loops checkpoint before exiting.
+shutdown = False
+
+
+def _handle_signal(sig, frame):
+    global shutdown
+    shutdown = True
+
+
+signal.signal(signal.SIGINT, _handle_signal)
+
+recipes_df = pd.read_csv(RECIPES_CSV)
+food_df = pd.read_csv(FOOD_CSV)
+
+recipes_sample = recipes_df
+food_sample = food_df
+
+print("Recipes dataset:", recipes_sample.shape)
+print("Food dataset:", food_sample.shape)
+
+food_names = food_sample['dish_name'].dropna().unique()
+food_names = [str(n).strip().lower() for n in food_names]
+
+matches_path = Path(MATCH_CHECKPOINT)
+state_path = Path(PROGRESS_STATE)
+final_path = Path(FINAL_CSV)
+
+matches = load_pickle_if_exists(matches_path, default=[])
+state = load_pickle_if_exists(state_path, default={"last_idx": 0})
+
+start_idx = state.get("last_idx", 0)
+print(f"Resuming matching from index {start_idx}. Already found {len(matches)} matches.")
+
+titles = recipes_sample['title'].dropna().unique()
+total_titles = len(titles)
+
+for offset, raw_title in enumerate(titles[start_idx:], start=start_idx):
+    if shutdown:
+        print("Shutdown requested; saving state and exiting matching loop.")
+        state['last_idx'] = offset
+        atomic_save(state, state_path)
+        atomic_save(matches, matches_path)
+        break
+
+    title = str(raw_title).strip().lower()
+    match = process.extractOne(title, food_names, score_cutoff=SCORE_CUTOFF)
+    if match is not None:
+        best_match, score, _ = match
+        matches.append((raw_title, best_match, score))
+
+        # Checkpoint only right after appending a new match; checking on every
+        # title re-saved constantly whenever len(matches) sat at a multiple of
+        # SAVE_EVERY (including 0, i.e. before the first match was found).
+        if len(matches) % SAVE_EVERY == 0:
+            print(f"[Checkpoint] Found {len(matches)} matches. Saving...")
+            state['last_idx'] = offset + 1
+            atomic_save(matches, matches_path)
+            atomic_save(state, state_path)
+
+else:
+    state['last_idx'] = total_titles
+    atomic_save(matches, matches_path)
+    atomic_save(state, state_path)
+    print("Completed matching loop. Saved final matches checkpoint.")
+
+matches = load_pickle_if_exists(matches_path, default=matches)
+print(f"Total matches to process into final dataset: {len(matches)}")
+
+if final_path.exists():
+    existing_df = pd.read_csv(final_path)
+    processed_count = len(existing_df)
+    final_data = existing_df.to_dict(orient="records")
+    print(f"Found existing final CSV with {processed_count} rows. Resuming from there.")
+else:
+    final_data = []
+    processed_count = 0
+
+for idx, (title, dish_name_lower, score) in enumerate(tqdm(matches, desc="Building final dataset"), start=0):
+    if idx < processed_count:
+        continue
+
+    rec_rows = recipes_sample[recipes_sample['title'].str.strip().str.lower() == str(title).strip().lower()]
+    if rec_rows.empty:
+        recipe_row = {}
+    else:
+        recipe_row = rec_rows.iloc[0]
+
+    food_rows = food_sample[food_sample['dish_name'].str.strip().str.lower() == dish_name_lower]
+    if food_rows.empty:
+        food_row = {}
+    else:
+        food_row = food_rows.iloc[0]
+
+    raw_url = food_row.get('image_url', None) if isinstance(food_row, pd.Series) else None
+    # A missing image_url comes back from pandas as NaN (a truthy float); the
+    # previous "if raw_url" guard then crashed on .replace(). Only strings are URLs.
+    image_file = raw_url.replace("https://file.b18a.io/", "") if isinstance(raw_url, str) else None
+
+    final_data.append({
+        "dish_name(Recipe)": title,
+        "dish_name(MMFood)": dish_name_lower,
+        "file_path": image_file,
+        "recipe": recipe_row.get('directions', None) if isinstance(recipe_row, pd.Series) else None,
+        "ingredients(Recipe)": recipe_row.get('NER', None) if isinstance(recipe_row, pd.Series) else None,
+        "ingredients(MMFood)": food_row.get('ingredients', None) if isinstance(food_row, pd.Series) else None,
+        "nutritional_profile": food_row.get('nutritional_profile', None) if isinstance(food_row, pd.Series) else None,
+        "cooking_method": food_row.get('cooking_method', None) if isinstance(food_row, pd.Series) else None,
+        "food_type": food_row.get('food_type', None) if isinstance(food_row, pd.Series) else None,
+        "genre": recipe_row.get('genre', None) if isinstance(recipe_row, pd.Series) else None,
+        "score": score
+    })
+
+    if (idx + 1) % SAVE_EVERY == 0:
+        print(f"[Final CSV checkpoint] Saving {len(final_data)} rows to {FINAL_CSV}")
+        df_partial = pd.DataFrame(final_data)
+        atomic_save_csv(df_partial, final_path)
+
+    if shutdown:
+        print("Shutdown requested; saving final CSV and exiting.")
+        df_partial = pd.DataFrame(final_data)
+        atomic_save_csv(df_partial, final_path)
+        break
+
+df_final = pd.DataFrame(final_data)
+atomic_save_csv(df_final, final_path)
+print(f"Final dataset saved to {FINAL_CSV} ({len(df_final)} rows).")
diff --git a/data/scripts/embd_approach.py b/data/scripts/embd_approach.py
new file mode 100644
index 0000000..b748762
--- /dev/null
+++ b/data/scripts/embd_approach.py
@@ -0,0 +1,170 @@
+"""Embedding-based merge of 3A2M recipes with MM-Food-100K dishes using FAISS."""
+import os
+import pickle
+import warnings
+
+import faiss
+import numpy as np
+import pandas as pd
+from sentence_transformers import SentenceTransformer
+from tqdm import tqdm
+
+warnings.filterwarnings('ignore')
+
+
+def create_faiss_food_merger():
+    """Embed both datasets, index them with FAISS inner-product search, and
+    merge every 3A2M title with its nearest MM-Food dish. Returns the DataFrame."""
+    print("Loading CSV files...")
+    mm_food_df = pd.read_csv('MM-Food-100K.csv')
+    a2m_df = pd.read_csv('3A2M.csv')
+
+    print(f"MM Food dataset shape: {mm_food_df.shape}")
+    print(f"3A2M dataset shape: {a2m_df.shape}")
+
+    mm_food_names = mm_food_df['dish_name'].fillna('').astype(str).tolist()
+    a2m_names = a2m_df['title'].fillna('').astype(str).tolist()
+
+    print("Loading embedding model...")
+    model = SentenceTransformer('all-MiniLM-L6-v2')
+    embedding_dim = 384
+
+    print("Creating embeddings for MM Food dishes...")
+    # Batch-encode instead of one model.encode() call per name: identical
+    # vectors, dramatically faster on both CPU and GPU.
+    mm_embeddings = model.encode(mm_food_names, show_progress_bar=True,
+                                 convert_to_numpy=True).astype('float32')
+    faiss.normalize_L2(mm_embeddings)
+
+    print("Building FAISS index for MM Food...")
+    mm_index = faiss.IndexFlatIP(embedding_dim)
+    mm_index.add(mm_embeddings)
+
+    print("Saving MM Food FAISS index...")
+    faiss.write_index(mm_index, 'mm_food_index.faiss')
+    mm_metadata = {
+        'names': mm_food_names,
+        'dataframe_indices': list(range(len(mm_food_names)))
+    }
+    with open('mm_food_metadata.pkl', 'wb') as f:
+        pickle.dump(mm_metadata, f)
+
+    print("Creating embeddings for 3A2M titles...")
+    a2m_embeddings = model.encode(a2m_names, show_progress_bar=True,
+                                  convert_to_numpy=True).astype('float32')
+    faiss.normalize_L2(a2m_embeddings)
+
+    print("Building FAISS index for 3A2M...")
+    a2m_index = faiss.IndexFlatIP(embedding_dim)
+    a2m_index.add(a2m_embeddings)
+
+    print("Saving 3A2M FAISS index...")
+    faiss.write_index(a2m_index, 'a2m_index.faiss')
+    a2m_metadata = {
+        'names': a2m_names,
+        'dataframe_indices': list(range(len(a2m_names)))
+    }
+    with open('a2m_metadata.pkl', 'wb') as f:
+        pickle.dump(a2m_metadata, f)
+
+    print("Finding closest matches for each 3A2M item...")
+    final_rows = []
+    threshold = 0.99
+    matches_above_threshold = 0
+
+    batch_size = 100
+    for i in tqdm(range(0, len(a2m_embeddings), batch_size), desc="Processing matches"):
+        batch_end = min(i + batch_size, len(a2m_embeddings))
+        batch_embeddings = a2m_embeddings[i:batch_end]
+
+        similarities, indices = mm_index.search(batch_embeddings, 1)
+
+        for j, (similarity, mm_idx) in enumerate(zip(similarities, indices)):
+            original_a2m_idx = i + j
+            similarity_score = float(similarity[0])
+            mm_match_idx = int(mm_idx[0])
+
+            if similarity_score >= threshold:
+                matches_above_threshold += 1
+
+            a2m_row = a2m_df.iloc[original_a2m_idx].to_dict()
+            mm_row = mm_food_df.iloc[mm_match_idx].to_dict()
+
+            merged_row = {}
+
+            for col, val in a2m_row.items():
+                merged_row[f'3A2M_{col}'] = val
+
+            for col, val in mm_row.items():
+                merged_row[f'MM_Food_{col}'] = val
+
+            merged_row['similarity_score'] = similarity_score
+            merged_row['above_threshold'] = similarity_score >= threshold
+
+            final_rows.append(merged_row)
+
+    print(f"Total matches processed: {len(final_rows)}")
+    print(f"Matches above threshold {threshold}: {matches_above_threshold}")
+
+    final_df = pd.DataFrame(final_rows)
+
+    print(f"Final dataset shape: {final_df.shape}")
+    print(f"Columns in final dataset: {len(final_df.columns)}")
+
+    final_df.to_csv('merged_food_dataset_faiss.csv', index=False)
+    print("Saved complete merged dataset to 'merged_food_dataset_faiss.csv'")
+
+    high_similarity_df = final_df[final_df['above_threshold'] == True]
+    if len(high_similarity_df) > 0:
+        high_similarity_df.to_csv('high_similarity_matches.csv', index=False)
+        print(f"Saved {len(high_similarity_df)} high similarity matches to 'high_similarity_matches.csv'")
+
+    return final_df
+
+
+def search_food_similarity(query_food_name, dataset='mm_food', top_k=1):
+    """Return the top_k nearest stored dishes for a free-text food name,
+    or None if the requested index has not been built yet."""
+    if dataset == 'mm_food':
+        index_file = 'mm_food_index.faiss'
+        metadata_file = 'mm_food_metadata.pkl'
+    else:
+        index_file = 'a2m_index.faiss'
+        metadata_file = 'a2m_metadata.pkl'
+
+    # NOTE: os.path was used here without 'import os' in the original file.
+    if not os.path.exists(index_file):
+        print(f"Index file {index_file} not found. Run create_faiss_food_merger() first.")
+        return None
+
+    model = SentenceTransformer('all-MiniLM-L6-v2')
+    index = faiss.read_index(index_file)
+
+    with open(metadata_file, 'rb') as f:
+        metadata = pickle.load(f)
+
+    query_embedding = model.encode(query_food_name).astype('float32').reshape(1, -1)
+    faiss.normalize_L2(query_embedding)
+
+    similarities, indices = index.search(query_embedding, top_k)
+
+    results = []
+    for sim, idx in zip(similarities[0], indices[0]):
+        results.append({
+            'food_name': metadata['names'][idx],
+            'similarity': float(sim),
+            'index': int(idx)
+        })
+
+    return results
+
+
+if __name__ == "__main__":
+    merged_df = create_faiss_food_merger()
+
+    print("\nSample of merged data:")
+    if len(merged_df) > 0:
+        print(merged_df[['3A2M_title', 'MM_Food_dish_name', 'similarity_score', 'above_threshold']].head())
+
+    print(f"\nTotal rows in final dataset: {len(merged_df)}")
+    print(f"High similarity matches (>= 0.99): {len(merged_df[merged_df['above_threshold'] == True])}")
+    print(f"Average similarity score: {merged_df['similarity_score'].mean():.4f}")