fatmacankara committed on
Commit
af56dfe
1 Parent(s): bb5a846

Update code/pdb_featureVector.py

Browse files
Files changed (1) hide show
  1. code/pdb_featureVector.py +8 -33
code/pdb_featureVector.py CHANGED
@@ -163,8 +163,7 @@ def pdb(input_set, mode, impute):
163
  data.at[i, 'wt_sequence_match'] = 'i'
164
  data.at[i, 'whichIsoform'] = whichIsoform
165
  break
166
- print('MATCHING UNIPTOR')
167
- print(data.to_string())
168
  data.wt_sequence_match = data.wt_sequence_match.astype('str')
169
  data.replace({'': 'nan'}, inplace=True)
170
  data_size = len(data.drop_duplicates(['datapoint']))
@@ -196,7 +195,6 @@ def pdb(input_set, mode, impute):
196
  pdbs = [item for sublist in pdbs for item in sublist]
197
 
198
  else:
199
- print('PDB List Empty')
200
  pdbs = []
201
  print('Processing PDB structures...\n')
202
  if pdbs == []:
@@ -274,12 +272,7 @@ def pdb(input_set, mode, impute):
274
  if chain_id in pdb_data_list:
275
  # Print UniProt IDs, chain ID, and resolution for the current model
276
  chain_id = chain.get_id()
277
- #st.write(f"---- Information for Chain {chain_id} in Model {i} ----")
278
- #st.write(f"UniProt IDs: {', '.join(uniprot_ids)}")
279
- #st.write(f"Chain ID: {chain_id}")
280
- #st.write(f"PDB ID: {search.upper()}")
281
- #st.write(f"Resolution: {resolution}")
282
- #st.write(f"Sequence: {sequence}")
283
  pdb_fasta.at[index, 'pdbID'] = search
284
  pdb_fasta.at[index, 'chain'] = chain_id
285
  pdb_fasta.at[index, 'pdbSequence'] = str(seqs[chain_id])
@@ -289,10 +282,6 @@ def pdb(input_set, mode, impute):
289
  pdb_info.at[index, 'resolution'] = resolution
290
  index += 1
291
 
292
- st.write('PDB INFO')
293
- st.write(pdb_info)
294
- st.write('PDB FASTA')
295
- st.write(pdb_fasta)
296
  print('PDB file processing finished..')
297
  for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")):
298
  try:
@@ -308,15 +297,13 @@ def pdb(input_set, mode, impute):
308
  filename.rename(filename_replace_ext.with_suffix('.pdb'))
309
  except:
310
  FileNotFoundError
311
- st.write('uniprot_matched before')
312
- st.write(uniprot_matched)
313
  uniprot_matched = pd.merge(uniprot_matched, pdb_info, on='uniprotID', how='left')
314
  uniprot_matched = uniprot_matched.astype(str)
315
  uniprot_matched = uniprot_matched.drop_duplicates()
316
  uniprot_matched = uniprot_matched.merge(pdb_fasta, on=['pdbID', 'chain'], how='left')
317
  uniprot_matched = uniprot_matched.astype(str)
318
- st.write('uniprot_matched after')
319
- st.write(uniprot_matched)
320
  with_pdb = uniprot_matched[(uniprot_matched.pdbID != 'nan') & (
321
  (uniprot_matched.resolution != 'nan') & (uniprot_matched.resolution != 'OT') & (
322
  uniprot_matched.resolution != 'None'))].drop_duplicates()
@@ -434,18 +421,12 @@ def pdb(input_set, mode, impute):
434
  with_pdb_size = len(with_pdb.drop_duplicates(['datapoint']))
435
  with_pdb = None
436
 
437
- print('dfM')
438
- print(dfM.to_string())
439
- print('dfNM')
440
- print(dfNM)
441
  print('Aligning sequences...\n')
442
  aligned_m = final_stage(dfM, annotation_list, Path(path_to_output_files / 'alignment_files'))
443
  aligned_nm = final_stage(dfNM, annotation_list, Path(path_to_output_files / 'alignment_files'))
444
 
445
- print('aligned_m')
446
- print(aligned_m.to_string())
447
- print('aligned_nm')
448
- print(aligned_nm.to_string())
449
 
450
 
451
  # When PDB sequence is nan, it is wrongly aligned to the UniProt sequence. Fix them.
@@ -488,9 +469,7 @@ def pdb(input_set, mode, impute):
488
  yes_pdb_no_match = after_up_pdb_alignment[
489
  (after_up_pdb_alignment.pdbID != 'nan') & (after_up_pdb_alignment.mutationPositionOnPDB == 'nan')]
490
  no_pdb = no_pdb.copy()
491
-
492
- print('-----PDB ALIGNED-----')
493
- print(pdb_aligned.to_string())
494
 
495
  print('PDB matching is completed...\n')
496
  print('SUMMARY')
@@ -892,7 +871,6 @@ def pdb(input_set, mode, impute):
892
  if protein not in existing_modbase_models:
893
  print('Downloading Modbase models for ', protein)
894
  url = 'https://salilab.org/modbase/retrieve/modbase/?databaseID=' + protein
895
- print(url)
896
  req = requests.get(url)
897
  name = path_to_output_files / 'modbase_structures' / f'{protein}.txt'
898
  with open(name, 'wb') as f:
@@ -1389,7 +1367,6 @@ def pdb(input_set, mode, impute):
1389
 
1390
  aligner = Align.PairwiseAligner()
1391
  print('Proceeding to 3D distance calculation...\n')
1392
- print(data.to_string())
1393
  data.domainEndonPDB = data.domainEndonPDB.astype(str)
1394
  data.domainStartonPDB = data.domainStartonPDB.astype(str)
1395
 
@@ -1419,8 +1396,7 @@ def pdb(input_set, mode, impute):
1419
  pdbID = data.at[i, 'pdbID']
1420
 
1421
  alignments = get_alignments_3D(uniprotID, 'nan', pdb_path, pdbSequence, source, chain, pdbID, mode, Path(path_to_output_files / '3D_alignment'), file_format = 'gzip')
1422
- print('main_alignments')
1423
- print(list(alignments))
1424
  mutPos = data.at[i, 'mutationPositionOnPDB']
1425
  try:
1426
  coordMut = get_coords(mutPos, alignments, 'nan', 'nan', mode)[0]
@@ -1549,7 +1525,6 @@ def pdb(input_set, mode, impute):
1549
  data.positions = data.positions.astype('str')
1550
  for i in data.index:
1551
  if (str(data.at[i, 'pos']) in data.at[i, 'positions']) and data.at[i, 'trsh4'] == 'surface':
1552
- print((str(data.at[i, 'pos']) in data.at[i, 'positions']))
1553
  data.at[i, 'threeState_trsh4_HQ'] = 'interface'
1554
  elif (str(data.at[i, 'pos']) not in data.at[i, 'positions']) and data.at[i, 'trsh4'] == 'surface':
1555
  data.at[i, 'threeState_trsh4_HQ'] = 'surface'
 
163
  data.at[i, 'wt_sequence_match'] = 'i'
164
  data.at[i, 'whichIsoform'] = whichIsoform
165
  break
166
+
 
167
  data.wt_sequence_match = data.wt_sequence_match.astype('str')
168
  data.replace({'': 'nan'}, inplace=True)
169
  data_size = len(data.drop_duplicates(['datapoint']))
 
195
  pdbs = [item for sublist in pdbs for item in sublist]
196
 
197
  else:
 
198
  pdbs = []
199
  print('Processing PDB structures...\n')
200
  if pdbs == []:
 
272
  if chain_id in pdb_data_list:
273
  # Print UniProt IDs, chain ID, and resolution for the current model
274
  chain_id = chain.get_id()
275
+
 
 
 
 
 
276
  pdb_fasta.at[index, 'pdbID'] = search
277
  pdb_fasta.at[index, 'chain'] = chain_id
278
  pdb_fasta.at[index, 'pdbSequence'] = str(seqs[chain_id])
 
282
  pdb_info.at[index, 'resolution'] = resolution
283
  index += 1
284
 
 
 
 
 
285
  print('PDB file processing finished..')
286
  for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")):
287
  try:
 
297
  filename.rename(filename_replace_ext.with_suffix('.pdb'))
298
  except:
299
  FileNotFoundError
300
+
 
301
  uniprot_matched = pd.merge(uniprot_matched, pdb_info, on='uniprotID', how='left')
302
  uniprot_matched = uniprot_matched.astype(str)
303
  uniprot_matched = uniprot_matched.drop_duplicates()
304
  uniprot_matched = uniprot_matched.merge(pdb_fasta, on=['pdbID', 'chain'], how='left')
305
  uniprot_matched = uniprot_matched.astype(str)
306
+
 
307
  with_pdb = uniprot_matched[(uniprot_matched.pdbID != 'nan') & (
308
  (uniprot_matched.resolution != 'nan') & (uniprot_matched.resolution != 'OT') & (
309
  uniprot_matched.resolution != 'None'))].drop_duplicates()
 
421
  with_pdb_size = len(with_pdb.drop_duplicates(['datapoint']))
422
  with_pdb = None
423
 
424
+
 
 
 
425
  print('Aligning sequences...\n')
426
  aligned_m = final_stage(dfM, annotation_list, Path(path_to_output_files / 'alignment_files'))
427
  aligned_nm = final_stage(dfNM, annotation_list, Path(path_to_output_files / 'alignment_files'))
428
 
429
+
 
 
 
430
 
431
 
432
  # When PDB sequence is nan, it is wrongly aligned to the UniProt sequence. Fix them.
 
469
  yes_pdb_no_match = after_up_pdb_alignment[
470
  (after_up_pdb_alignment.pdbID != 'nan') & (after_up_pdb_alignment.mutationPositionOnPDB == 'nan')]
471
  no_pdb = no_pdb.copy()
472
+
 
 
473
 
474
  print('PDB matching is completed...\n')
475
  print('SUMMARY')
 
871
  if protein not in existing_modbase_models:
872
  print('Downloading Modbase models for ', protein)
873
  url = 'https://salilab.org/modbase/retrieve/modbase/?databaseID=' + protein
 
874
  req = requests.get(url)
875
  name = path_to_output_files / 'modbase_structures' / f'{protein}.txt'
876
  with open(name, 'wb') as f:
 
1367
 
1368
  aligner = Align.PairwiseAligner()
1369
  print('Proceeding to 3D distance calculation...\n')
 
1370
  data.domainEndonPDB = data.domainEndonPDB.astype(str)
1371
  data.domainStartonPDB = data.domainStartonPDB.astype(str)
1372
 
 
1396
  pdbID = data.at[i, 'pdbID']
1397
 
1398
  alignments = get_alignments_3D(uniprotID, 'nan', pdb_path, pdbSequence, source, chain, pdbID, mode, Path(path_to_output_files / '3D_alignment'), file_format = 'gzip')
1399
+
 
1400
  mutPos = data.at[i, 'mutationPositionOnPDB']
1401
  try:
1402
  coordMut = get_coords(mutPos, alignments, 'nan', 'nan', mode)[0]
 
1525
  data.positions = data.positions.astype('str')
1526
  for i in data.index:
1527
  if (str(data.at[i, 'pos']) in data.at[i, 'positions']) and data.at[i, 'trsh4'] == 'surface':
 
1528
  data.at[i, 'threeState_trsh4_HQ'] = 'interface'
1529
  elif (str(data.at[i, 'pos']) not in data.at[i, 'positions']) and data.at[i, 'trsh4'] == 'surface':
1530
  data.at[i, 'threeState_trsh4_HQ'] = 'surface'