cui_grouping/code/bernotate.py at master · vanboefer/cui_grouping · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
import argparse
import requests
import json
import logging
import datetime
import time
import pandas as pd
from pathlib import Path


# logging
logging.basicConfig(
    filename='fail_log',
    level=logging.WARNING)
log = logging.getLogger(__name__)


URL_BERN = 'https://bern.korea.ac.kr/plain'


def bernotate_df(df, path, fail_limit):
    """
    Annotate the text in the `df` with BERN and save the results in json files (a file per record) in the directory specified by `path`.

    Parameters
    ----------
    df: pandas DataFrame
        dataframe that contains the text for annotation in a column called 'all_text_clean' and a unique record id as the index
    path: str
        path to the directory where the output json files are stored
    fail_limit: int
        the number of consecutive errors (when sending a request for BERN annotation) after which the annotation procedure breaks

    Returns
    -------
    None
    """

    start       = time.time()
    length      = len(df)
    fail_count  = 0
    total_fails = 0

    for cnt, row in enumerate(df.itertuples()):
        idx = row.Index
        text = row.all_text_clean
        display_status(cnt, idx, length)
        status = bernotate_text(idx, text, path)

        if not status:
            fail_count  += 1
            total_fails += 1
            if fail_count > fail_limit and fail_limit > 0:
                print(f"FAILED {fail_count} TIMES. BREAKING DOWN PROCEDURE.")
                break

    exc_time = str(datetime.timedelta(seconds=time.time() - start))
    print(
        "BERN annotation completed; execution time: "
        f"{exc_time} hh:mm:ss; "
        f"{cnt + 1} records processed; "
        f"{total_fails} records failed."
    )


def bernotate_text(idx, text, path):
    """
    Annotate `text` with BERN and store the results in a json file (file name defined by `idx`) in the directory specified by `path`.

    Parameters
    ----------
    idx: {str, int}
        unique identifier of the processed record
    text: str
        text to be annotated
    path: str
        path to the directory where the output json file is stored

    Returns
    -------
    True: bool
        if annotation succeeded
    False: bool
        if annotation failed
    """

    try:
        data = {'sample_text': text}
        response = requests.post(URL_BERN, data=data)
        response.raise_for_status()
        print('| status_code: ', response.status_code)

        parsed = response.json()
        file = path / f"{idx}.json"
        with file.open('w') as f:
            json.dump(parsed, f)
        return True

    except requests.exceptions.ConnectionError:
        log.warning(f"{idx} failed on ConnectionError")
        print('| connection failed')
        return False

    except requests.exceptions.HTTPError:
        log.warning(f"{idx} failed on HTTPError")
        print('| http failed')
        return False

    except json.decoder.JSONDecodeError:
        log.warning(f"{idx} failed on JSONDecodeError")
        print('| json decoder failed')
        return False


def display_status(cnt, idx, length):
    """
    Displays progress of the annotation (see the function `bernotate_df`).
    """

    percentage = f"({((cnt + 1) / length) * 100:.1f} %)"
    print(
        f"{cnt + 1:0{len(str(length))}d} / {length} "
        f"{percentage:>9} | "
        f"Processing record {idx}", end=' ', flush=True,
    )


if __name__ == '__main__':

    """
    Annotate text with BERN and save the results in json files (a file per record) in the directory specified by the '--path_out' parameter.

    The text used for annotation is found under the 'all_text_clean' column in the pickled dataframe specified by the '--data' parameter.

    To determine the batch, use the parameters '--start' and '--end';
    this refers to the row index in the pickled dataframe specified by the '--data' parameter.

    The '--fail_limit' parameter determines the number of consecutive errors
    (when sending a request for BERN annotation) after which the annotation procedure breaks.
    """

    ############## PARAMETERS ##############
    argparser = argparse.ArgumentParser()
    argparser.add_argument('--data', default='../data/data_sample.pkl')
    argparser.add_argument('--path_out', default='../data/bern_json/')
    argparser.add_argument('--start', default=0)
    argparser.add_argument('--end', default=200)
    argparser.add_argument('--fail_limit', default=5)
    args = argparser.parse_args()

    path = Path(args.path_out)
    path.mkdir(exist_ok=True, parents=True)

    ############## LOAD DATA ##############
    df = pd.read_pickle(args.data)

    ############## ANNOTATE with BERN ##############
    batch = df.iloc[int(args.start):int(args.end)+1]

    # check which studies in the batch are already annotated and exclude them
    print('Initial number of records: ', len(batch))
    idx = [f.stem for f in path.glob('*.json')]
    batch = batch[~batch.index.isin(idx)]
    print('Number of records to be processed: ', len(batch))

    bernotate_df(batch, path, fail_limit=int(args.fail_limit))