# Copyright (c) 2013-2020, SIB - Swiss Institute of Bioinformatics and
#                          Biozentrum - University of Basel
# 
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# 
#   http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from promod3 import loop
import os
import traceback
from ost import io
from ost import seq
from ost import mol
from ost import conop

# This script generates a StructureDB using a list of nonredundant pdb
# files. All pdb files get cleaned using Molck from openstructure
# with the settings specified below.
# Please note, that the values for the structure profile in the produced
# db are only allocated and simply set to zero.
# You therefore need to create an initial db, from which you can create
# these structure profiles and set them in a second step.
# This should be pretty well described in the documentation.

# To obtain the required pdb and hhm files, you need to run
# the get_data_from_smtl.py script. As an alternative, you can fetch your own
# structures from the pdb, obtain the hhm files using the hh-suite
# (https://github.com/soedinglab/hh-suite) and hack this script so that
# it uses your input directly.

# file generated by get_data_from_smtl.py (or by yourself)
# every line must be in the format: pdb_id chain_name
# e.g. 5UZG A
# The file is read inside a with-block so the handle gets closed right after
# all lines have been loaded.
with open("cullpdb_pc60_res2.5_R1.0_d180524_chains25634_data_extracted_from_smtl.txt", 'r') as data_file:
  data = data_file.readlines()

# directory containing the structures. naming: pdb_id.pdb
# (pdb_id in upper case!) e.g. 5UZG.pdb
structure_dir = "structures"

# directory containing the hhm files for chain. naming pdb_id+chain_id.hhm
# (pdb_id and chain_id in upper case!) e.g. 5UZGA.hhm
profile_dir = "hmms"

# Molck settings used to clean the input structures: every flag listed
# below gets switched on, in the given order.
molck_settings = mol.alg.MolckSettings()
for _flag_name in ("rm_unk_atoms",
                   "rm_non_std",
                   "rm_hyd_atoms",
                   "rm_oxt_atoms",
                   "rm_zero_occ_atoms",
                   "map_nonstd_res",
                   "assign_elem"):
  setattr(molck_settings, _flag_name, True)

# The default compound library is used here; this assumes that a proper
# compound lib has been specified when compiling openstructure.
compound_lib = conop.GetDefaultLib()

# Do it!!
# Creates a StructureDB (data type: StructureDBDataType.All) and tries to add
# one entry for every "pdb_id chain_name" line in data. Entries with missing
# input files are skipped with a message; any other failure while processing
# an entry prints its traceback and processing continues with the next entry.
# Finally, statistics are printed and the db is saved in portable format.
structure_db = loop.StructureDB(loop.StructureDBDataType.All)

for i,line in enumerate(data):

  split_line = line.split()
  if not split_line:
    # blank line (e.g. an empty line at the end of the file): nothing to do
    continue
  if len(split_line) < 2:
    # expected format is: pdb_id chain_name
    print('malformed entry on line: ',i,'... skip...')
    continue
  prot_id = split_line[0]
  chain_id = split_line[1]

  try:

    print('processing: ',prot_id,' on line: ',i)

    prot_path = os.path.join(structure_dir, prot_id+'.pdb')
    hhm_path = os.path.join(profile_dir,prot_id+chain_id+".hhm")

    if not os.path.exists(prot_path):
      print("Could not find structure file... skip...")
      continue
    # check the profile file (not the structure file a second time)
    if not os.path.exists(hhm_path):
      print("Could not find hhm file... skip...")
      continue

    # load and clean full structure
    prot = io.LoadPDB(prot_path)
    mol.alg.Molck(prot, compound_lib, molck_settings)

    # we're only interested in the peptide chains...
    prot = prot.Select("peptide=true")

    # get profile and seqres
    hmm = io.LoadSequenceProfile(hhm_path)
    raw_seqres = hmm.sequence
    seqres = seq.CreateSequence("seqres", raw_seqres)

    # add it
    structure_db.AddCoordinates(prot_id, chain_id,
                                prot, seqres, hmm)

  except Exception:
    # Report the problem and carry on with the next entry. Only Exception is
    # caught, so that KeyboardInterrupt / SystemExit can still stop the script.
    traceback.print_exc()


structure_db.PrintStatistics()
structure_db.SavePortable('portable_structure_db_60.dat')


