def name_to_smiles(name: str) -> str:
    """Convert from chemical name to SMILES string using chemical identifier resolver.

    This script is useful to combine with ``df.apply`` from pandas, hence it does not
    throw exceptions for invalid molecules but instead returns an empty string for
    easy subsequent postprocessing of the dataframe.

    Args:
        name: Name or nickname of compound.

    Returns:
        SMILES string corresponding to chemical name. An empty string is returned
        if the name is not a string, is blank, or the lookup fails for any reason.
    """
    from urllib.parse import quote

    # Non-string cells (e.g. NaN) and blank names cannot be resolved; honour the
    # "never raises" contract without issuing a pointless network request.
    if not isinstance(name, str) or not name.strip():
        return ""

    try:
        # Percent-encode the name so that spaces, reserved characters ("#", "?")
        # and non-ASCII characters yield a valid URL. "%" is kept as-is so that
        # input which is already percent-encoded is not encoded a second time.
        encoded_name = quote(name, safe="/%")
        url = (
            "http://cactus.nci.nih.gov/chemical/structure/" + encoded_name + "/smiles"
        )

        # Certificate and hostname verification are deliberately disabled for
        # this request.
        ctx = ssl.create_default_context()
        ctx.check_hostname = False
        ctx.verify_mode = ssl.CERT_NONE

        with urllib.request.urlopen(url, context=ctx) as web:
            smiles = web.read().decode("utf8")

        # A response containing HTML markup is treated as a failed lookup.
        if "</div>" in smiles:
            return ""

        return smiles

    except Exception:
        # Best effort by design: any failure maps to an empty string.
        return ""
@lru_cache(maxsize=None)
@_disk_cache
def _smiles_to_mordred_features(smiles: str) -> np.ndarray:
    """Compute the Mordred descriptor vector for a single SMILES string.

    Results are memoized in memory via ``lru_cache``, and the function is
    additionally wrapped by the ``_disk_cache`` decorator.

    Args:
        smiles: SMILES string.

    Returns:
        Mordred descriptors for the given smiles string. If the computation
        fails for any reason, an all-NaN array with one entry per descriptor
        of the calculator is returned instead.
    """
    try:
        molecule = Chem.MolFromSmiles(smiles)
        descriptor_values = _mordred_calculator(molecule).fill_missing()
        return np.asarray(descriptor_values)
    except Exception:
        # Fall back to a NaN placeholder of matching length.
        n_descriptors = len(_mordred_calculator.descriptors)
        return np.full(n_descriptors, np.nan)
def smiles_to_mordred_features(
    smiles_list: list[str],
    prefix: str = "",
    dropna: bool = True,
) -> pd.DataFrame:
    """Build a dataframe of Mordred descriptors, one row per SMILES string.

    Args:
        smiles_list: List of SMILES strings.
        prefix: Name prefix for each descriptor column. Columns are named
            ``<prefix>MORDRED_<descriptor>``.
        dropna: If ``True``, drops columns that contain NaNs.

    Returns:
        Dataframe with one row per SMILES string and one column per Mordred
        descriptor (minus the NaN-containing columns if ``dropna`` is set).
    """
    rows = [_smiles_to_mordred_features(entry) for entry in smiles_list]
    column_labels = [
        prefix + "MORDRED_" + str(descriptor)
        for descriptor in list(_mordred_calculator.descriptors)
    ]
    table = pd.DataFrame(data=rows, columns=column_labels, dtype=DTypeFloatNumpy)

    if not dropna:
        return table
    return table.dropna(axis=1)
def smiles_to_molecules(smiles_list: list[str]) -> list[Chem.Mol]:
    """Parse each SMILES string of a list into an RDKit molecule object.

    Args:
        smiles_list: List of SMILES strings.

    Returns:
        List of corresponding molecules, in the order of the input.

    Raises:
        ValueError: If the SMILES does not seem to be chemically valid.
    """
    molecules = []
    for smiles in smiles_list:
        try:
            parsed = Chem.MolFromSmiles(smiles)
            # A ``None`` result is routed through the same error path as a
            # parser exception.
            if parsed is None:
                raise ValueError()
            molecules.append(parsed)
        except Exception as ex:
            message = f"The SMILES {smiles} does not seem to be chemically valid."
            raise ValueError(message) from ex

    return molecules
def smiles_to_rdkit_features(
    smiles_list: list[str], prefix: str = "", dropna: bool = True
) -> pd.DataFrame:
    """Build a dataframe of RDKit descriptors, one row per SMILES string.

    Args:
        smiles_list: List of SMILES strings.
        prefix: Name prefix for each descriptor column. Columns are named
            ``<prefix>RDKIT_<descriptor>``.
        dropna: If ``True``, drops columns that contain NaNs.

    Returns:
        Dataframe with one row per SMILES string and one column per entry of
        ``Chem.Descriptors.descList`` (minus the NaN-containing columns if
        ``dropna`` is set).

    Raises:
        ValueError: Propagated from ``smiles_to_molecules`` for invalid SMILES.
    """
    molecules = smiles_to_molecules(smiles_list)

    # Evaluate every (name, function) descriptor pair on every molecule.
    rows = [
        {
            prefix + "RDKIT_" + label: DTypeFloatNumpy(compute(molecule))
            for label, compute in Chem.Descriptors.descList
        }
        for molecule in molecules
    ]
    table = pd.DataFrame(rows)

    if not dropna:
        return table
    return table.dropna(axis=1)
def smiles_to_fp_features(
    smiles_list: list[str],
    prefix: str = "",
    dtype: type[int] | type[float] = int,
    radius: int = 4,
    n_bits: int = 1024,
) -> pd.DataFrame:
    """Compute standard Morgan molecule fingerprints for a list of SMILES strings.

    Args:
        smiles_list: List of SMILES strings.
        prefix: Name prefix for each fingerprint column. Columns are named
            ``<prefix>FP_<i>`` with ``i`` counting from 1.
        dtype: Specifies whether fingerprints will have int or float data type.
        radius: Radius for the Morgan fingerprint.
        n_bits: Number of bits for the Morgan fingerprint.

    Returns:
        Dataframe containing Morgan fingerprints for each SMILES string, one
        row per molecule and one column per fingerprint bit.

    Raises:
        ValueError: Propagated from ``smiles_to_molecules`` for invalid SMILES.
    """
    mols = smiles_to_molecules(smiles_list)
    if not mols:
        # Nothing to fingerprint: return an empty frame without touching the
        # RDKit logger.
        return pd.DataFrame([])

    # The log level is loop-invariant, so it is set once before the loop.
    RDLogger.logger().setLevel(RDLogger.CRITICAL)

    column_stem = prefix + "FP_"
    rows = []
    for mol in mols:
        bit_string = GetMorganFingerprintAsBitVect(
            mol, radius, nBits=n_bits
        ).ToBitString()
        # Each character of the bit string is "0" or "1"; convert it directly
        # to the requested type without an intermediate array.
        rows.append(
            {
                f"{column_stem}{position}": dtype(int(bit))
                for position, bit in enumerate(bit_string, start=1)
            }
        )

    return pd.DataFrame(rows)
def is_valid_smiles(smiles: str) -> bool:
    """Check whether RDKit is able to parse the given SMILES string.

    Args:
        smiles: SMILES string to be tested.

    Returns:
        ``True`` if the provided SMILES is valid, ``False`` else. Any exception
        raised while parsing also counts as invalid.
    """
    try:
        parsed = Chem.MolFromSmiles(smiles)
    except Exception:
        return False

    # The parser result is ``None`` when no molecule could be built.
    return parsed is not None
def get_canonical_smiles(smiles: str) -> str:
    """Return the "canonical" representation of the given SMILES.

    Args:
        smiles: SMILES string to be canonicalized.

    Returns:
        The SMILES string produced by converting the input to a molecule and
        back to SMILES.

    Raises:
        ValueError: If the conversion fails. The underlying exception is
            attached as the cause.
    """
    try:
        return Chem.MolToSmiles(Chem.MolFromSmiles(smiles))
    except Exception as ex:
        # Chain explicitly so the original failure stays visible to callers.
        raise ValueError(
            f"The SMILES '{smiles}' does not appear to be valid."
        ) from ex