# PepTune / scoring / functions / scoring_utils.py
# Sophia Tang
# model upload
# e54915d
import warnings
import numpy as np
from loguru import logger
from sklearn.ensemble import RandomForestRegressor
from rdkit.Chem import Descriptors, rdMolDescriptors
import joblib
from transformation import TransformFunction
from rdkit import Chem, rdBase, DataStructs
from rdkit.Chem import AllChem
from typing import List
def fingerprints_from_mol(molecule, radius=3, size=2048, hashed=False):
    """
    Build a Morgan (ECFP-style) fingerprint of one molecule as a NumPy row.

    Args:
        molecule: RDKit molecule to fingerprint.
        radius: Morgan radius forwarded to RDKit.
        size: Fingerprint length, forwarded as ``nBits``.
        hashed: If true, ``AllChem.GetHashedMorganFingerprint`` is used;
            otherwise ``AllChem.GetMorganFingerprintAsBitVect``.

    Returns:
        The fingerprint converted to a NumPy array and reshaped to a single
        row (``reshape(1, -1)``).
    """
    make_fingerprint = (
        AllChem.GetHashedMorganFingerprint
        if hashed
        else AllChem.GetMorganFingerprintAsBitVect
    )
    fingerprint = make_fingerprint(molecule, radius, nBits=size)

    # The one-element array is only a destination buffer; RDKit writes the
    # fingerprint into it in place.
    row = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(fingerprint, row)
    return row.reshape(1, -1)
def fingerprints_from_smiles(smiles: List, size=2048):
    """
    Create ECFP fingerprints for a list of SMILES, with a validity check.

    Args:
        smiles: SMILES strings to convert.
        size: Fingerprint length forwarded to ``fingerprints_from_mol``.

    Returns:
        A tuple ``(fps, valid_mask)``. ``fps`` stacks one row per input along
        axis 0 (an all-zero row of width ``size`` where no molecule was
        obtained, and a ``(0, size)`` array for empty input). ``valid_mask``
        holds 1 where ``Chem.MolFromSmiles`` returned a molecule and 0 where
        it returned ``None``.
    """
    rows = []
    valid_mask = []
    for smile in smiles:
        mol = Chem.MolFromSmiles(smile)
        valid_mask.append(0 if mol is None else 1)
        if mol:
            rows.append(fingerprints_from_mol(mol, size=size))
        else:
            # Placeholder row keeps the output aligned with the input order.
            rows.append(np.zeros((1, size)))

    if not rows:
        return np.zeros((0, size)), valid_mask
    return np.concatenate(rows, axis=0), valid_mask
def getMolDescriptors(mol, missingVal=0):
    """
    Calculate the full list of descriptors for a molecule.

    Every ``(name, function)`` pair in ``Descriptors._descList`` is evaluated
    first, followed by three extra counts from ``rdMolDescriptors``
    (Lipinski H-bond donors, Lipinski H-bond acceptors, rotatable bonds).

    Args:
        mol: Molecule handed to each descriptor function.
        missingVal: Value recorded for any descriptor whose function raises.

    Returns:
        A tuple ``(values, names)`` of two equally long lists in matching
        order.
    """
    custom_descriptors = {
        'hydrogen-bond donors': rdMolDescriptors.CalcNumLipinskiHBD,
        'hydrogen-bond acceptors': rdMolDescriptors.CalcNumLipinskiHBA,
        'rotatable bonds': rdMolDescriptors.CalcNumRotatableBonds,
    }

    values, names = [], []
    # Built-in descriptors come first, custom ones last, so the column order
    # of the resulting feature vector stays the same.
    for nm, fn in [*Descriptors._descList, *custom_descriptors.items()]:
        try:
            val = fn(mol)
        except Exception:
            # Best-effort: a failing descriptor yields the placeholder value.
            # Catching Exception (not everything) lets KeyboardInterrupt and
            # SystemExit propagate instead of being silently swallowed.
            val = missingVal
        values.append(val)
        names.append(nm)
    return values, names
def get_pep_dps_from_smi(smi):
    """
    Compute the descriptor vector for a single SMILES string.

    Args:
        smi: SMILES string parsed with ``Chem.MolFromSmiles``.

    Returns:
        NumPy array built from the descriptor values returned by
        ``getMolDescriptors``. If parsing raises, a message is printed and
        the descriptors are computed for ``mol = None``.
    """
    try:
        mol = Chem.MolFromSmiles(smi)
    except Exception:
        # Catch Exception rather than everything so KeyboardInterrupt and
        # SystemExit still propagate out of long scoring loops.
        print(f"convert smi {smi} to molecule failed!")
        mol = None
    # The descriptor names are not needed here, only the values.
    dps, _ = getMolDescriptors(mol)
    return np.array(dps)
def get_pep_dps(smi_list):
    """
    Stack descriptor vectors for a collection of SMILES strings.

    Args:
        smi_list: Sized collection of SMILES strings.

    Returns:
        NumPy array with one entry per SMILES, each produced by
        ``get_pep_dps_from_smi``; a ``(0, 211)`` zero array for empty input.
    """
    if len(smi_list) > 0:
        rows = [get_pep_dps_from_smi(smi) for smi in smi_list]
        return np.array(rows)
    # NOTE(review): 211 is presumably the number of values produced by
    # getMolDescriptors (RDKit descriptor list + 3 custom ones) for the RDKit
    # version in use - confirm it still matches the non-empty output width.
    return np.zeros((0, 211))
"""def get_smi_from_helms(helm_seqs: list):
valid_idxes = []
valid_smiles = []
for idx, helm in enumerate(helm_seqs):
# Ignore HELM sequences which cannot be converted into molecules
try:
smi = get_cycpep_smi_from_helm(helm)
if smi:
valid_idxes.append(idx)
valid_smiles.append(smi)
except Exception as e:
# logger.debug(f'Error: {e} in helm {helm}')
pass
return valid_smiles, valid_idxes"""
def check_smi_validity(smiles: list):
    """
    Filter a list of SMILES down to the entries that yield a molecule.

    Args:
        smiles: Candidate SMILES strings; falsy entries (e.g. ``""`` or
            ``None``) are treated as invalid without being parsed.

    Returns:
        A tuple ``(valid_smi, valid_idx)``: the accepted SMILES and their
        positions in the input list, in input order.
    """
    kept_smiles = []
    kept_positions = []
    for position, candidate in enumerate(smiles):
        try:
            if not candidate:
                continue
            if Chem.MolFromSmiles(candidate):
                kept_smiles.append(candidate)
                kept_positions.append(position)
        except Exception:
            # Best-effort filter: an entry that raises is simply not kept.
            continue
    return kept_smiles, kept_positions