import warnings
from typing import List, Tuple

import joblib
import numpy as np
from loguru import logger
from rdkit import Chem, rdBase, DataStructs
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors, rdMolDescriptors
from sklearn.ensemble import RandomForestRegressor

from transformation import TransformFunction

# Extra descriptors appended after the entries of ``Descriptors._descList``.
# Kept at module level so the feature width can be computed without a molecule.
_CUSTOM_DESCRIPTORS = {
    'hydrogen-bond donors': rdMolDescriptors.CalcNumLipinskiHBD,
    'hydrogen-bond acceptors': rdMolDescriptors.CalcNumLipinskiHBA,
    'rotatable bonds': rdMolDescriptors.CalcNumRotatableBonds,
}


def fingerprints_from_mol(molecule, radius=3, size=2048, hashed=False):
    """Create the ECFP (Morgan) fingerprint of a molecule.

    Args:
        molecule: RDKit molecule to fingerprint.
        radius: Morgan radius passed to RDKit.
        size: Number of bits / hash buckets (``nBits``).
        hashed: If true use ``GetHashedMorganFingerprint``, otherwise
            ``GetMorganFingerprintAsBitVect``.

    Returns:
        The fingerprint as a NumPy array reshaped to a single row ``(1, -1)``.
    """
    if hashed:
        fp_bits = AllChem.GetHashedMorganFingerprint(molecule, radius, nBits=size)
    else:
        fp_bits = AllChem.GetMorganFingerprintAsBitVect(molecule, radius, nBits=size)
    # Placeholder array; ConvertToNumpyArray fills it from the fingerprint
    # (RDKit resizes the destination to the fingerprint length).
    fp_np = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(fp_bits, fp_np)
    return fp_np.reshape(1, -1)


def fingerprints_from_smiles(smiles: List, size=2048) -> Tuple[np.ndarray, List[int]]:
    """Create ECFP fingerprints for a list of SMILES, with a validity check.

    Args:
        smiles: SMILES strings to featurize.
        size: Fingerprint length; also the width of the zero row used for
            SMILES that RDKit cannot parse.

    Returns:
        ``(fps, valid_mask)`` where ``fps`` stacks one row per input (shape
        ``(0, size)`` for empty input) and ``valid_mask[i]`` is 1 if the i-th
        SMILES parsed and 0 otherwise.
    """
    fps = []
    valid_mask = []
    for smile in smiles:
        mol = Chem.MolFromSmiles(smile)
        is_valid = mol is not None
        valid_mask.append(int(is_valid))
        # Unparsable SMILES keep their slot as an all-zero row so that row
        # indices stay aligned with the input list.
        fps.append(fingerprints_from_mol(mol, size=size) if is_valid else np.zeros((1, size)))
    stacked = np.concatenate(fps, axis=0) if fps else np.zeros((0, size))
    return stacked, valid_mask


def _safe_descriptor(fn, mol, missing_val):
    """Evaluate one descriptor function, returning ``missing_val`` if it raises."""
    try:
        return fn(mol)
    except Exception:
        # Best-effort featurization: a failing descriptor (including every
        # descriptor when ``mol`` is None) falls back to the placeholder.
        return missing_val


def getMolDescriptors(mol, missingVal=0):
    """Calculate the full list of descriptors for a molecule.

    Evaluates every ``(name, function)`` pair in ``Descriptors._descList``
    followed by the three custom Lipinski / rotatable-bond descriptors.

    Args:
        mol: RDKit molecule (may be ``None``; every value is then ``missingVal``).
        missingVal: Value recorded for a descriptor whose computation raises.

    Returns:
        ``(values, names)``: two parallel lists of descriptor values and names.
    """
    values, names = [], []
    descriptor_table = list(Descriptors._descList) + list(_CUSTOM_DESCRIPTORS.items())
    for nm, fn in descriptor_table:
        values.append(_safe_descriptor(fn, mol, missingVal))
        names.append(nm)
    return values, names


def get_pep_dps_from_smi(smi):
    """Return the descriptor vector of a single SMILES string.

    A SMILES that cannot be converted to a molecule is reported through the
    logger and yields a vector filled with the descriptor placeholder value.

    Args:
        smi: SMILES string.

    Returns:
        1-D NumPy array of descriptor values.
    """
    try:
        mol = Chem.MolFromSmiles(smi)
    except Exception:
        # e.g. a non-string input rejected by the RDKit binding
        mol = None
    if mol is None:
        # MolFromSmiles signals an unparsable SMILES by returning None rather
        # than raising, so the failure has to be detected here.
        logger.warning("convert smi {} to molecule failed!", smi)
    dps, _ = getMolDescriptors(mol)
    return np.array(dps)


def get_pep_dps(smi_list):
    """Return the descriptor matrix for a list of SMILES strings.

    Args:
        smi_list: Sequence of SMILES strings.

    Returns:
        NumPy array with one descriptor row per SMILES. For empty input the
        result has zero rows and as many columns as there are descriptors.
    """
    if len(smi_list) == 0:
        # Derive the width from the descriptor tables instead of hard-coding
        # it: the length of Descriptors._descList depends on the RDKit release.
        n_features = len(Descriptors._descList) + len(_CUSTOM_DESCRIPTORS)
        return np.zeros((0, n_features))
    return np.array([get_pep_dps_from_smi(smi) for smi in smi_list])


# NOTE(review): disabled HELM helper, kept for reference. It relies on
# `get_cycpep_smi_from_helm`, which is not imported in the visible part of
# this module.
#
# def get_smi_from_helms(helm_seqs: list):
#     valid_idxes = []
#     valid_smiles = []
#
#     for idx, helm in enumerate(helm_seqs):
#         # Ignore helm which cannot be converted into molecules
#         try:
#             smi = get_cycpep_smi_from_helm(helm)
#             if smi:
#                 valid_idxes.append(idx)
#                 valid_smiles.append(smi)
#         except Exception as e:
#             # logger.debug(f'Error: {e} in helm {helm}')
#             pass
#     return valid_smiles, valid_idxes


def check_smi_validity(smiles: list):
    """Filter a list of SMILES down to those RDKit can parse.

    Args:
        smiles: Candidate SMILES strings; empty / falsy entries are treated
            as invalid.

    Returns:
        ``(valid_smi, valid_idx)``: the parsable SMILES and their positions
        in the input list.
    """
    valid_smi, valid_idx = [], []
    for idx, smi in enumerate(smiles):
        try:
            mol = Chem.MolFromSmiles(smi) if smi else None
        except Exception:
            # Deliberate best-effort: an entry that cannot even be handed to
            # the parser is simply treated as invalid.
            continue
        if mol is not None:
            valid_smi.append(smi)
            valid_idx.append(idx)
    return valid_smi, valid_idx