CEP/utils.py at master · NIVA-Knowledge-Graph/CEP · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72

from pubchempy import get_compounds, Compound
import pandas as pd
import numpy as np

def tanimoto(fp1, fp2):
    fp1 = int(fp1, 16)
    fp2 = int(fp2, 16)
    fp1_count = bin(fp1).count('1')
    fp2_count = bin(fp2).count('1')
    both_count = bin(fp1 & fp2).count('1')
    return float(both_count) / (fp1_count + fp2_count - both_count)

def get_fingerprint(cid):
    c = Compound.from_cid(cid)
    return c.cactvs_fingerprint

def load_data_csv(filename):
    df = pd.read_csv(filename)

    X,y = [], []
    for uri, value in zip(df['cid'],df['y']):
        cid = uri.split('CID')[-1]
        fp = get_fingerprint(cid)
        if fp:
            fp = [int(f) for f in fp]
            X.append(fp)
            y.append(float(value))
    return X,y

def save_data(X,Y,filename):
    with open(filename,'w') as f:
        for x,y in zip(X,Y):
            s = ','.join([str(i) for i in x])
            s += '|' + str(y) + '\n'
            f.write(s)

def load_data(filename):
    X,Y = [], []
    with open(filename,'r') as f:
        for l in f:
            x,y = l.strip().split('|')
            x = [float(i) for i in x.split(',')]
            y = float(y)
            X.append(x)
            Y.append(y)
    return X,Y


def write_results(filename, results, cols):
    i = len(results)
    divider = '|---------------' * len(cols) +'|\n'
    with open('results.md','w') as f:
        s = '|'+'|'.join(cols) + '|\n'
        f.write(s)
        f.write(divider)
        for dataset,r21,r22,r23 in results:
            s = '|'+str(dataset)+'|'+str(round(r21,3))+'|'+str(round(r22,3))+'|'+str(round(r23,3))+'|'+'\n'
            f.write(s)
            i -= 1
            if i > 0:
                f.write(divider)

def to_tanimoto(X1,X2):
    tmp = []
    for x1 in X1:
        for x2 in X2:
            a = '0b'+''.join([str(int(x)) for x in x1])
            b = '0b'+''.join([str(int(x)) for x in x2])
            tmp.append(tanimoto(a,b))
    X = np.asarray(tmp).reshape((len(X1),len(X2)))
    return X