#!/bin/bash
# DPCstruct/run_example.sh (main branch of RitAreaSciencePark/DPCstruct on GitHub)

set -o errexit

# cleanup: remove the directories this example run generates under ./example
# (filtered alignments, primary clusters, secondary clusters and output),
# printing a message before and after. The extracted ./example/alns input
# directory is not touched.
cleanup() {
    local generated
    printf '%s\n' "Performing cleanup..."
    for generated in alns_filtered primaryclusters secondaryclusters output; do
        rm -rf "./example/${generated}"
    done
    printf '%s\n' "Cleanup complete."
}
# trap cleanup EXIT

# This file contains a step-by-step guide on how to run the DPCstruct pipeline.
# For this purpose we will use a small dataset of around 1250 proteins as an example.
# To keep the repository light we will assume the proteins are already compressed into a Foldseek database using the following commands:

## 0. Preprocessing
# protLookup="./example/proteins.tsv"
# Rename the PDB files to their indexes (this step can also be performed after the all vs. all search)
# while IFS=' ' read -r index file; do mv "${file}.pdb" "${index}"; done < ${protLookup}

### create db
# foldseek createdb example_pdbs/ example_db/example_db

## 1. Search all vs. all
# queryDB="./example/database/example_db"
# targetDB="./example/database/example_db"
# alns="./example/alns/alns"
# alnsConverted="./example/alns/alns.tsv"
# mkdir -p "$(dirname "$alns")"
# tmpDir="./tmp"
# mkdir ${tmpDir}

### foldseek search (--max-seqs=100 000 for bigger datasets)
# foldseek search ${queryDB} ${targetDB} ${alns} ${tmpDir}  -s 7.5 --max-seqs 1000 -e 0.001 -a --threads ${SLURM_CPUS_PER_TASK}
# foldseek convertalis ${queryDB} ${targetDB} ${alns} ${alnsConverted} --format-mode 4 --format-output query,target,qstart,qend,tstart,tend,qlen,tlen,alnlen,pident,evalue,bits,alntmscore,lddt

# We start the pipeline from the prefilter step:
# Inputs for the pipeline: the protein lookup table and the converted
# alignment table (alns.tsv), which is expected to come out of
# ./example/alns.zip when it is not already present.
protLookup="./example/proteins.tsv"
alnsDir="./example/alns"
alnsConverted="${alnsDir}/alns.tsv"

mkdir -p "${alnsDir}"

# Only unzip if the file does not exist.
if [[ ! -f "${alnsConverted}" ]]; then
    unzip ./example/alns.zip -d "${alnsDir}"
fi

# Check if the file exists after extraction. This catches an archive that
# extracts without producing alns.tsv at the expected path.
if [[ ! -f "${alnsConverted}" ]]; then
    # Diagnostics go to stderr so they are not mixed with regular output.
    echo "Error: File $alnsConverted does not exist or failed to extract." >&2
    exit 1
fi

## 2. Prefilters
# Runs `dpcstruct prefilters` with the converted alignments (-i) and the
# protein lookup table (-m); the result goes to alns_filtered.tsv (-o), which
# the primary clustering step reads next.
alnsFilteredDir="./example/alns_filtered"
alnsFiltered="${alnsFilteredDir}/alns_filtered.tsv"
mkdir -p "${alnsFilteredDir}"

# Expansions are quoted so each path is always passed as a single argument.
dpcstruct prefilters -i "${alnsConverted}" -m "${protLookup}" -o "${alnsFiltered}"

## 3. Primary clustering
# Runs `dpcstruct primarycluster` on the filtered alignments (-i) and writes
# the primary clusters to pcs.bin (-o).
pcsDir="./example/primaryclusters"
pcsFile="${pcsDir}/pcs.bin"
mkdir -p "${pcsDir}"

# NOTE(review): -t 4 is presumably the thread count -- confirm against the
# dpcstruct help output.
dpcstruct primarycluster -i "${alnsFiltered}" -o "${pcsFile}" -t 4

## 4. Secondary clustering
# Two dpcstruct calls: `secondarycluster distance` writes distance_matrix.bin
# from the primary clusters, then `secondarycluster classify` reads that file
# and writes sc_classification.txt.
scDistDir="./example/secondaryclusters/distance"
scLabelsDir="./example/secondaryclusters/classification"
scDistFile="${scDistDir}/distance_matrix.bin"
scLabelsFile="${scLabelsDir}/sc_classification.txt"
mkdir -p "${scDistDir}"
mkdir -p "${scLabelsDir}"

# The same primary-cluster file is passed as both -i and -j.
# NOTE(review): the meaning of -p 1 and -c 4 is not visible here -- confirm
# against the dpcstruct help output.
dpcstruct secondarycluster distance -i "${pcsFile}" -j "${pcsFile}" -o "${scDistFile}" -p 1 -c 4
dpcstruct secondarycluster classify "${scDistFile}" -o "${scLabelsFile}"

## 5. Traceback
# Runs `dpcstruct traceback` on the primary clusters with the secondary
# cluster labels (-l); results are written into the binary output
# directory (-o).
tracebackDir="./example/output/binary"
mkdir -p "${tracebackDir}"
dpcstruct traceback "${pcsFile}" -l "${scLabelsFile}" -o "${tracebackDir}"

## 6. Postfilter
# Runs `dpcstruct postfilters` on sequence-labels_1.bin from the traceback
# directory and writes sequence-labels_1.txt into the output directory.
outputDir="./example/output"
# Use the variable rather than repeating the literal path, so the created
# directory and the -o destination cannot drift apart.
mkdir -p "${outputDir}"
dpcstruct postfilters -i "${tracebackDir}/sequence-labels_1.bin" -o "${outputDir}/sequence-labels_1.txt"