PathoSpreading/process_files.py at main · MathieuBo/PathoSpreading · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import pandas as pd
import numpy as np


def process_pathdata(exp_data, connectivity_ipsi, connectivity_contra):

    """
    Function that processes the quantified alpha-synuclein data and the connectivity maps (ipsilateral and controlateral)
    ---
    Inputs:
        exp_data --> Table that contains the values of the quantified alpha-synuclein pathology measured at different
        timepoints and with different conditions.
        connectivity_ipsi/contra --> Connectivity tables, the index and the columns are the regions of interest


    ---
    Outputs:
        W --> adjacency matrix shaped in a DataFrame
        path_data --> experimental DataFrame that contains the alpha-syn quantification
        conn_names --> List of brain region names
        ROI_names --> ordered list of iRegions and then cRegions
    """
    conn_names = [i.split(' (')[0] for i in connectivity_contra.columns]

    # Extract the name of the brain regions but not the subregions
    path_names = exp_data.columns[2::]

    c_path_names_contra = [path_names[i] for i in range(0, len(path_names)) if
                           path_names[i][0] == "c"]  # Extraction of the names of the regions starting by a "c"
    path_names_contra = [i[1:] for i in c_path_names_contra]  # Same without the first letter (to fit with conn.names)

    i_path_names_ipsi = [path_names[i] for i in range(0, len(path_names)) if path_names[i][0] == "i"]
    path_names_ipsi = [i[1:] for i in i_path_names_ipsi]

    # Order path and connectivity by the same name
    path_data_ipsi = exp_data.loc[:, i_path_names_ipsi]  # Creates a DataFrame with reorganized columns
    path_data_contra = exp_data.loc[:, c_path_names_contra]

    ordered_matched_ipsi = []
    for i in range(0, len(conn_names)):  # len(conn_names) = 58
        for k in range(0, len(path_names_ipsi)):  # len(path_names_ipsi) =
            if path_names_ipsi[k] == conn_names[i]:
                ordered_matched_ipsi.append(k)
                # Returns a list containing the ranks of path_names_ipsi that fits conn.names

    ordered_matched_contra = []
    for i in range(0, len(conn_names)):
        for k in range(0, len(path_names_contra)):
            if path_names_contra[k] == conn_names[i]:
                ordered_matched_contra.append(k)
                # Returns a list containing the ranks of path_names_contra that fits conn.names

    # Creation of path_data
    path_data = pd.concat([exp_data.loc[:, exp_data.columns[0:2]._index_data],
                       path_data_ipsi.loc[:, path_data_ipsi.columns._index_data[ordered_matched_ipsi]],
                       path_data_contra.loc[:, path_data_contra.columns._index_data[ordered_matched_contra]]],
                      axis=1)

    path_data = path_data.rename(columns={"MBSC Region": "Conditions"})  # Renaming a column

    #Reorganizing so that the seed "iCPu" is the first column
    # tile matrix such that sources are rows, columns are targets (Oh et al. 2014 Fig 4)
    connectivity_ipsi.columns = conn_names
    # Sets the names of the columns and the index to be the same using the list conn_names
    connectivity_ipsi.index = conn_names

    connectivity_contra.columns = conn_names
    connectivity_contra.index = conn_names

    W = pd.concat([pd.concat([connectivity_ipsi, connectivity_contra], axis=1),
                   pd.concat([connectivity_contra, connectivity_ipsi], axis=1)], axis=0)

    # Checking if the matrix was tiled properly
    if ((W.iloc[0:57, 0:57] != W.iloc[0:57, 0:57]).sum()).sum() > 0:  # Summing over columns and then rows
        print("!!! Adjacency matrix: failed concatenation !!!")  # If False the double sum equals 0
    else:
        print('Adjacency matrix: successful concatenation')

    # retain indices to reorder like original data variable for plotting on mouse brains
    ROInames = ["i" + i for i in conn_names] + ["c" + i for i in conn_names]

    # List of ROI w/ first the contro regions and then the ipsi regions.
    orig_order = []
    for i in range(0, len(ROInames)):  # Reordering according to ROInames
        for k in range(0, len(exp_data.columns._index_data) - 2):
            if ROInames[k] == exp_data.columns._index_data[2::][i]:
                orig_order.append(k)  # List containing the index of the original data

    return W, path_data, conn_names, ROInames


def process_gene_expression_data(expression, roi_names):
    """ Function to process gene expression data
    ---
    Inputs:
        expression --> Pandas DataFrame containing the gene expression data per region
        roi_names --> ROInames created in process_pathdata. List of ordered ROInames (first iROI then cROI)
    ---
    Outputs:
        Ordered expression data as pandas Dataframe. Panda DataFrame
    """

    expression_ordered = []
    for i in range(0, len(roi_names)):  # Reordering according to ROInames
        for k in range(0, len(expression.index)):
            if roi_names[i] == expression.index[k]:
                expression_ordered.append(expression.index[k])

    return expression.loc[expression_ordered, :]  # Reordered Snca expression


def mean_pathology(timepoints, path_data):
    """
    Process experimental data to return mean per group and per timepoint
    ---
    Inputs:
        timepoints: list of experimental timepoints
        path_data: path_data created in process_path_data. Contains the pathology data quantified.
    ---
    Outputs:
        grp_mean: Dataframe with mean pathology per group, timepoints and regions
        or
        ind_grp: Multi-index Dataframe: first column index (1,3,6) (MPI), second column index (1,2,3,...)
        (Number of animals used) to call a specific column ==> ind_grp.loc[:, ('1', '1')]
        multi_index: Returns the MultiIndex Dataframe
    """

    mice = []
    for time in timepoints:  # Creation of a list of 3 panda dataframe. These 3 df correspond to the 3 tp
        l = path_data[(path_data[path_data.columns[0]] == time) & (path_data[path_data.columns[1]] == 'NTG')][
            path_data.columns[2::]]
        l = l.reset_index(drop=True)
        # Reset the index, drop = True is used to remove the old index as it is by default added as a column
        mice.append(l)  # list of Dataframe

    grp_mean = []
    for i in np.arange(len(timepoints)):
        grp_mean.append(mice[i].mean())  # Careful, now the mean are display as columns and not as rows anymore
    grp_mean = pd.concat([grp_mean[i] for i in np.arange(len(timepoints))], axis=1)
    grp_mean.columns = ["MPI {}".format(i) for i in timepoints]

    return grp_mean