# NLP_Project/adding_type.py at main · WeskerPRO/NLP_Project · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import pandas as pd
import json
from collections import Counter, defaultdict

def get_unique_paraphrase_type_ids(df, column="paraphrase_type_ids"):
    """Return the sorted distinct paraphrase type ids found in ``df[column]``.

    Each non-null cell is expected to hold a JSON-encoded list such as
    ``"[4, 6, 26]"``. Cells that already contain a list or tuple are used
    as-is. Anything else is skipped instead of aborting the scan: malformed
    JSON, JSON that does not decode to a list (a bare number, a string,
    an object), and non-string scalars.

    Args:
        df: DataFrame containing ``column``.
        column: Name of the column holding the encoded id lists.

    Returns:
        A sorted list of the unique ids.
    """
    unique_ids = set()

    for val in df[column].dropna():
        if isinstance(val, str):
            # Keep the try body minimal: only the decode can raise here.
            try:
                ids = json.loads(val)
            except json.JSONDecodeError:
                continue  # Skip malformed rows
        else:
            ids = val

        # Only list-like payloads carry ids. Feeding a decoded scalar to
        # set.update() would raise TypeError, and a decoded string or dict
        # would silently add its characters or keys as bogus ids.
        if not isinstance(ids, (list, tuple)):
            continue

        unique_ids.update(ids)

    return sorted(unique_ids)

# Load the paraphrase training CSV (adjust the path/delimiter to your setup).
df = pd.read_csv("./project/data/etpc-paraphrase-train.csv")

# id_counts: how many rows mention each paraphrase type id.
# id_to_examples: up to max_examples_per_id (sentence1, sentence2) pairs per id.
id_counts = Counter()
id_to_examples = defaultdict(list)
max_examples_per_id = 3


for _, row in df.iterrows():
    # The paraphrase_type_ids column contains a string of a list, e.g. "[4, 6, 26]"
    paraphrase_type_ids_str = row.get("paraphrase_type_ids", "")
    if pd.isna(paraphrase_type_ids_str) or not paraphrase_type_ids_str:
        continue

    # Skip malformed or non-string cells instead of aborting the whole run,
    # matching how get_unique_paraphrase_type_ids treats bad rows.
    try:
        paraphrase_type_ids = json.loads(paraphrase_type_ids_str)
    except (json.JSONDecodeError, TypeError):
        continue
    if not isinstance(paraphrase_type_ids, list):
        continue

    sentence1 = row.get("sentence1", "")
    sentence2 = row.get("sentence2", "")

    id_counts.update(paraphrase_type_ids)

    for pid in paraphrase_type_ids:
        # Keep only the first few sentence pairs seen for each id.
        if len(id_to_examples[pid]) < max_examples_per_id:
            id_to_examples[pid].append((sentence1, sentence2))

# Print the sorted unique paraphrase type ids found in the dataset.
unique_ids = get_unique_paraphrase_type_ids(df)
print(unique_ids)
# The string literal below is a disabled report of per-id counts and examples;
# it is a no-op expression and is kept only for reference.
'''
for pid, count in id_counts.most_common():
    print(f"ID: {pid} | Count: {count}")
    print("Examples:")
    for s1, s2 in id_to_examples[pid]:
        print(f"  S1: {s1}")
        print(f"  S2: {s2}")
    print('-' * 80)
'''