generated from GippLab-DNLP-Team/dnlp-final-project
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathadding_type.py
More file actions
51 lines (41 loc) · 1.63 KB
/
adding_type.py
File metadata and controls
51 lines (41 loc) · 1.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import pandas as pd
import json
from collections import Counter, defaultdict
def get_unique_paraphrase_type_ids(df, column="paraphrase_type_ids"):
    """Return the sorted distinct IDs found in a JSON-list column of *df*.

    Every non-null cell of ``df[column]`` is expected to hold a JSON array
    written as text, e.g. ``"[4, 6, 26]"``. Cells that cannot be decoded, or
    that decode to something other than a list, are skipped instead of
    aborting the whole scan.

    Args:
        df: DataFrame that contains ``column``.
        column: Name of the column holding the JSON-encoded ID lists.

    Returns:
        A sorted list with each ID that occurs in at least one usable row.

    Raises:
        KeyError: If ``column`` is not a column of ``df``.
    """
    unique_ids = set()
    for val in df[column].dropna():
        try:
            ids = json.loads(val)
        except (json.JSONDecodeError, TypeError):
            # JSONDecodeError: the text is not valid JSON.
            # TypeError: the cell is not str/bytes at all (e.g. a float).
            continue
        if not isinstance(ids, list):
            # Valid JSON, but a scalar or object rather than a list of IDs;
            # feeding it to set.update() would crash or add unrelated keys.
            continue
        unique_ids.update(ids)
    return sorted(unique_ids)
# Load the CSV file (adjust the path, and pass ``sep=...`` to read_csv if your
# copy uses a delimiter other than a comma).
df = pd.read_csv("./project/data/etpc-paraphrase-train.csv")

id_counts = Counter()  # paraphrase type ID -> number of occurrences across rows
id_to_examples = defaultdict(list)  # paraphrase type ID -> sample (s1, s2) pairs
max_examples_per_id = 3  # keep at most this many sample pairs per ID
PRINT_ID_REPORT = False  # set to True to print per-ID counts and examples

for _, row in df.iterrows():
    # The paraphrase_type_ids column contains a string of a list,
    # e.g. "[4, 6, 26]".
    paraphrase_type_ids_str = row.get("paraphrase_type_ids", "")
    if pd.isna(paraphrase_type_ids_str) or not paraphrase_type_ids_str:
        continue  # missing or empty cell: nothing to count

    try:
        paraphrase_type_ids = json.loads(paraphrase_type_ids_str)
    except (json.JSONDecodeError, TypeError):
        # Skip rows that cannot be decoded rather than aborting the run,
        # mirroring how get_unique_paraphrase_type_ids treats malformed rows.
        continue
    if not isinstance(paraphrase_type_ids, list):
        continue  # valid JSON but not a list of IDs

    sentence1 = row.get("sentence1", "")
    sentence2 = row.get("sentence2", "")

    id_counts.update(paraphrase_type_ids)
    for pid in paraphrase_type_ids:
        if len(id_to_examples[pid]) < max_examples_per_id:
            id_to_examples[pid].append((sentence1, sentence2))

# Print the sorted list of distinct paraphrase type IDs found in the file.
unique_ids = get_unique_paraphrase_type_ids(df)
print(unique_ids)

# Optional report, disabled by default: occurrence count and the stored
# example sentence pairs for every ID, most frequent first.
if PRINT_ID_REPORT:
    for pid, count in id_counts.most_common():
        print(f"ID: {pid} | Count: {count}")
        print("Examples:")
        for s1, s2 in id_to_examples[pid]:
            print(f" S1: {s1}")
            print(f" S2: {s2}")
        print('-' * 80)