# Uniqpepextractor_v10.py
import os
import read_fasta_file
import tryptic_peptide
import sys
from datetime import date, datetime
# Run-wide stamps. The clock is read once so that the date stamp and the time
# stamp always describe the same instant; two separate reads could disagree
# if the script happened to start right at midnight.
cwd = os.getcwd()
now = datetime.now()
today = now.date()
dt = today.strftime("%m%d%y")  # MMDDYY
current_time = now.strftime("%H%M%S")  # HHMMSS
def generate_folder(infile_path, outfile_path):
    """Create one output folder per FASTA file found in *infile_path*.

    Every directory entry whose last dot-separated component is ``fasta``
    gets a folder inside *outfile_path*, named after the part of the entry
    name that precedes the first ``.fasta``.

    Args:
        infile_path: Directory that is scanned for FASTA entries.
        outfile_path: Directory inside which the folders are created.

    A folder that already exists is left as it is. A folder that cannot be
    created is reported on stdout and skipped.
    """
    for entry in os.listdir(infile_path):
        if entry.split('.')[-1] != 'fasta':
            continue
        # os.path.join supplies the separator when outfile_path has no
        # trailing slash; plain concatenation would create a sibling of
        # outfile_path instead of a folder inside it.
        target = os.path.join(outfile_path, entry.split('.fasta')[0])
        try:
            # exist_ok keeps a re-run from reporting existing folders as
            # failures.
            os.makedirs(target, exist_ok=True)
        except OSError:
            print('Folder for ' + entry + ' has not generated.')
# Residues seen so far, accumulated across every call to amino_acids().
aa_known = {}    # one-letter code -> 'Full name@Three-letter code'
aa_unknown = {}  # unrecognised character -> ''

# One-letter code -> 'Full name@Three-letter code'. Built once at import time
# instead of on every call.
_AA_NAMES = {
    'G': 'Glycine@Gly', 'A': 'Alanine@Ala', 'L': 'Leucine@Leu',
    'M': 'Methionine@Met', 'F': 'Phenylalanine@Phe', 'W': 'Tryptophan@Trp',
    'K': 'Lysine@Lys', 'Q': 'Glutamine@Gln', 'E': 'Glutamic Acid@Glu',
    'S': 'Serine@Ser', 'P': 'Proline@Pro', 'V': 'Valine@Val',
    'I': 'Isoleucine@Ile', 'C': 'Cysteine@Cys', 'Y': 'Tyrosine@Tyr',
    'H': 'Histidine@His', 'R': 'Arginine@Arg', 'N': 'Asparagine@Asn',
    'D': 'Aspartic Acid@Asp', 'T': 'Threonine@Thr',
    'U': 'Selenocysteine@Sec',
}


def amino_acids(AA_seq):
    """Record which residues of *AA_seq* are recognised amino-acid codes.

    The sequence is upper-cased and each character is filed into the
    module-level ``aa_known`` dict (code -> name string) or ``aa_unknown``
    dict (character -> ``''``). Both dicts keep growing across calls.

    Args:
        AA_seq: Sequence string; matching is case-insensitive.

    Returns:
        The first character ever recorded in ``aa_unknown`` (not necessarily
        one from this call), or ``None`` while no unknown character has been
        seen.
    """
    for residue in AA_seq.upper():
        if residue in _AA_NAMES:
            aa_known[residue] = _AA_NAMES[residue]
        else:
            aa_unknown[residue] = ''
    # Same result as looping over aa_unknown and returning on the first key.
    return next(iter(aa_unknown), None)
def Unique_pep(infile, outfile, miss_cleave, min_len, max_len):
    """Digest every FASTA file in *infile* and write out its unique peptides.

    Each sequence returned by ``read_fasta_file.read_fasta`` is passed to
    ``tryptic_peptide.tryptic_peptide_trypsin`` once for every value in
    ``0..miss_cleave``. Peptides containing C, M, X or Z are dropped. A
    peptide recorded exactly once over the whole run is appended to
    ``<outfile>/<folder>/<stem>_Unique_Peptides_<MMDDYY>.txt`` for the FASTA
    file it came from, as one tab-separated line: peptide, first word of the
    protein header, protein header, peptide length, FASTA file name.

    Args:
        infile: Directory holding the ``*.fasta`` input files.
        outfile: Directory that receives one sub-folder per FASTA file.
        miss_cleave: Highest value passed as the second argument of
            ``tryptic_peptide_trypsin`` (presumably the number of missed
            cleavages; confirm against that module). Converted with ``int``.
        min_len: Passed to ``tryptic_peptide_trypsin`` after ``int``
            conversion (presumably the minimum peptide length).
        max_len: Passed to ``tryptic_peptide_trypsin`` after ``int``
            conversion (presumably the maximum peptide length).

    Progress and the amino-acid summary are printed to stdout. An output
    file that cannot be written is reported on stdout and skipped.
    """
    suffix = '.fasta'
    excluded = frozenset('CMXZ')
    min_len = int(min_len)
    max_len = int(max_len)

    fasta_names = [
        name for name in os.listdir(infile)
        if os.path.isfile(os.path.join(infile, name))
        and name.split('.')[-1] == 'fasta'
    ]

    # peptide -> [(protein header, FASTA file name), ...]
    tryptic_pep = {}
    for name in fasta_names:
        for rows in read_fasta_file.read_fasta(os.path.join(infile, name)):
            seq = rows[1].rstrip()
            amino_acids(seq)  # called for its effect on aa_known / aa_unknown
            for missed in range(int(miss_cleave) + 1):
                peptides = tryptic_peptide.tryptic_peptide_trypsin(
                    seq, missed, min_len, max_len)
                for pep in peptides:
                    if excluded.intersection(pep):
                        continue
                    tryptic_pep.setdefault(pep, []).append((rows[0], name))

    print('Protein digestion by trypsin is complete and ' + str(len(tryptic_pep)) + ' unique peptides are stored')
    print('Number of known amino acids found: ' + str(len(aa_known)))
    for code, label in aa_known.items():
        print(code + '\t' + label.split('@')[0] + '\t' + label.split('@')[1])
    print('Number of unknown amino acids found: ' + str(len(aa_unknown)))
    for code in aa_unknown:
        print(code)

    generate_folder(infile, outfile)  # Generate folders to store the species-specific output file
    print('Species specific output folders are generated.')

    # Collect the output lines per FASTA file so each result file is opened
    # once, rather than listing the output directory and reopening the file
    # for every single peptide.
    lines_by_file = {}
    for pep, sources in tryptic_pep.items():
        if len(sources) != 1:
            continue
        header, name = sources[0]
        # Only the part of the header before the first tab is written, which
        # keeps the output at a fixed five columns.
        protein = header.split('\t')[0]
        lines_by_file.setdefault(name, []).append(
            pep + '\t' + protein.split(' ')[0] + '\t' + protein + '\t'
            + str(len(pep)) + '\t' + name + '\n')

    for name, lines in lines_by_file.items():
        # The folder is derived with the same rule generate_folder() uses, so
        # a file is matched to exactly its own folder and never to another
        # folder whose name merely contains the file name.
        folder = name.split(suffix)[0]
        # Remove the suffix itself: str.rstrip('.fasta') would strip any
        # trailing run of the characters '.', 'f', 'a', 's', 't' and mangle
        # names such as 'M_abscessus.fasta'.
        stem = name[:-len(suffix)] if name.endswith(suffix) else name
        out_path = os.path.join(
            outfile, folder, stem + '_Unique_Peptides_' + dt + '.txt')
        try:
            # Append mode, so results from earlier runs on the same day are
            # kept.
            with open(out_path, 'a') as handle:
                handle.writelines(lines)
        except OSError as err:
            print('Could not write ' + out_path + ': ' + str(err))
if __name__ == "__main__":
    # Arguments: INPUT_DIR OUTPUT_DIR MISSED_CLEAVAGES MIN_LEN MAX_LEN
    # Example:
    #   python Uniqpepextractor_v10.py D:/Skyline/NTMs/ALL_Mycobcaterium_Species/ D:/Skyline/NTMs/NTMs_In_silico_Peptides/ 0 7 25
    if len(sys.argv) < 6:
        # Exit with a readable message instead of an IndexError traceback.
        sys.exit('usage: python ' + sys.argv[0]
                 + ' INPUT_DIR OUTPUT_DIR MISSED_CLEAVAGES MIN_LEN MAX_LEN')
    Unique_pep(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5])