I've been mulling this over for a while. Right now the data lives in program variables, but I want to persist it to CSV so nothing is lost when the program shuts down. Better yet, since I'd like to cross-search it, I want to put it in a database — but writing the conversions between formats one by one is a pain...
What I came up with is a minimal approach: build the table from the column information alone, then insert the values into it.
First, read the column names from the CSV file. I'd rather not try to infer each column's type automatically, so the types are specified explicitly. Since the purpose here is machine learning, the default column type is double-precision floating point; only the integer-typed and text-typed columns need to be listed. From the column names just obtained, the CREATE TABLE and INSERT INTO statements are then generated. (You could use pandas for the INSERT, but building this flow makes various other tasks easier later.)
CalculateDescriptors.py
import os
import pandas as pd
# Load the compounds from an SDF file.
# I: SDF path
# O: list of RDKit Mol objects (records that failed to parse are dropped)
def get_mols(sdfpath):
    from rdkit import Chem
    supplier = Chem.SDMolSupplier(sdfpath)
    # SDMolSupplier yields None for records it cannot parse; skip those.
    return [m for m in supplier if m is not None]
# Return basic information about each compound:
# [compound name, MOL block, atom count, bond count, SMILES, InChI].
# I: list of RDKit Mol objects
# O: pandas DataFrame, one row per compound, indexed 0..len(mols)-1
def get_values_base(mols):
    from rdkit import Chem
    columns = ['Name', 'Structure', 'Atoms', 'Bonds', 'SMILES', 'InChI']
    values = [
        [
            mol.GetProp('_Name'),
            Chem.MolToMolBlock(mol),
            mol.GetNumAtoms(),
            mol.GetNumBonds(),
            Chem.MolToSmiles(mol),
            Chem.MolToInchi(mol),
        ]
        for mol in mols
    ]
    index = list(range(len(mols)))
    return pd.DataFrame(values, columns=columns, index=index)
# Return the external (SDF property) parameters of each compound.
# I: list of objects exposing GetProp(name) (e.g. RDKit Mol)
# O: pandas DataFrame with the columns prefixed 'ext_', one row per compound
def get_values_external(mols):
    # NOTE: the original imported rdkit.Chem here but never used it;
    # this function only needs GetProp() on each element.
    columns = ['ID', 'NAME', 'SOL', 'SMILES', 'SOL_classification']
    # Collect the raw property strings for every compound.
    values = [[mol.GetProp(column) for column in columns] for mol in mols]
    # Prefix the column names so they do not clash with other descriptor sets
    # when all frames are concatenated side by side.
    ext_columns = ['ext_' + column for column in columns]
    index = list(range(len(mols)))
    return pd.DataFrame(values, columns=ext_columns, index=index)
# Calculate descriptors with RDKit.
# I: list of RDKit Mol objects
# O: pandas DataFrame, one row per compound, one column per RDKit descriptor
def get_rdkit_descriptors(mols):
    # NOTE: AllChem was imported but unused in the original; dropped.
    from rdkit.Chem import Descriptors
    from rdkit.ML.Descriptors import MoleculeDescriptors
    # RDKit descriptor calculation: use every descriptor RDKit registers.
    descLists = [desc_name[0] for desc_name in Descriptors._descList]
    calcs = MoleculeDescriptors.MolecularDescriptorCalculator(descLists)
    values = [calcs.CalcDescriptors(mol) for mol in mols]
    # Convert to DataFrame (the original comment here had lost its '#',
    # which made the line a syntax error).
    index = list(range(len(mols)))
    df = pd.DataFrame(values, columns=descLists, index=index)
    return df
# Calculate descriptors with mordred.
# I: list of RDKit Mol objects
# O: pandas DataFrame, one row per compound, one column per mordred descriptor
def get_mordred_descriptors(mols):
    # Calculation of the mordred descriptors (the original comment here had
    # lost its '#', which made the line a syntax error).
    from mordred import Calculator, descriptors
    calcs = Calculator(descriptors, ignore_3D=False)
    df = calcs.pandas(mols)
    # Re-index with a plain 0..n-1 integer index so this frame lines up with
    # the other descriptor frames when concatenated.
    df['index'] = list(range(len(mols)))
    df.set_index('index', inplace=True)
    return df
# Calculate descriptors with CDK (external Java tool).
# I: sdfpath        - SDF file to process
#    workfolderpath - folder for the temporary output file
#    java_path      - java executable file path
#    cdk_jar_path   - CDK jar file path
# O: pandas DataFrame parsed from the tool's tab-separated output
def get_cdk_descriptors(sdfpath, workfolderpath, java_path, cdk_jar_path):
    import subprocess
    filepath = os.path.join(workfolderpath, 'tmp.csv')
    # Pass the command as an argument list: with shell=False a single command
    # string is only accepted on Windows, and the unquoted space in
    # 'C:\Program Files\...' would break it anyway.
    command = [java_path, '-jar', cdk_jar_path,
               '-b', sdfpath, '-t', 'all', '-o', filepath]
    print(command)
    subprocess.run(command, shell=False)
    try:
        # Despite the .csv extension, the tool writes tab-separated data.
        df = pd.read_table(filepath)
    finally:
        # Always remove the temporary file, even if parsing fails.
        os.remove(filepath)
    return df
# Main processing: compute every descriptor set for one SDF file and merge
# them side by side into a single CSV.
def main():
    data_folderpath = 'D:\\data\\python_data\\chem'
    sdfpath = os.path.join(data_folderpath, 'sdf\\solubility.test.20.sdf')
    csvpath = 'solubility.test.csv'
    java_path = 'C:\\Program Files\\Java\\jdk-14.0.1\\bin\\java.exe'
    workfolderpath = os.path.dirname(os.path.abspath(__file__))
    cdk_jar_path = os.path.join(data_folderpath, 'jar\\CDKDescUI-1.4.6.jar')
    # Compound acquisition
    mols = get_mols(sdfpath)
    # Get each value
    # (python libraries — all calculators share the signature f(mols) -> DataFrame)
    calculators = [get_values_base, get_values_external,
                   get_rdkit_descriptors, get_mordred_descriptors]
    frames = [calculate(mols) for calculate in calculators]
    # (jar file calculation)
    frames.append(get_cdk_descriptors(sdfpath, workfolderpath, java_path, cdk_jar_path))
    # Combine all frames column-wise (they share the same 0..n-1 index).
    merged = pd.concat(frames, axis=1)
    merged.to_csv('all_parameters.csv')
    print(merged)
# Start process
if __name__ == '__main__':
    main()
Here's another file.
CreateSQL.py
import ReadCSVColumn
# Return SQL: DROP
# I: Table name
# O: SQL string
def get_sql_drop_table(table_name):
    return f'DROP TABLE IF EXISTS {table_name}'
# Return SQL: SELECT
# I: Table name
#    Column name list
# O: SQL string
def get_sql_select(table_name, columns):
    column_csv = ','.join(columns)
    # Literal lines stay at column 0 so the emitted SQL text is unchanged.
    return f'''
SELECT
{column_csv}
FROM
{table_name}
'''
# Return SQL: INSERT
# I: Table name
#    Column name list
# O: SQL string with one '%s' placeholder per column
#    (psycopg2-style parameter binding)
def get_sql_insert_table(table_name, columns):
    column_csv = ','.join(columns)
    placeholder_csv = ','.join(['%s'] * len(columns))
    # Literal lines stay at column 0 so the emitted SQL text is unchanged.
    return f'''
INSERT INTO
{table_name}
(
{column_csv}
)
VALUES
(
{placeholder_csv}
)
'''
# Return SQL: CREATE TABLE
# I: Table name
#    Column name list
#    Column name list (subset to be typed integer)
#    Column name list (subset to be typed text)
# O: SQL string; columns in neither subset default to double precision
def get_sql_create_table(table_name, columns, columns_int, columns_text):
    def sql_type(column):
        # Priority: integer first, then text, else the floating-point default.
        if column in columns_int:
            return 'integer'
        if column in columns_text:
            return 'text'
        return 'double precision'
    sql_columns = [column + ' ' + sql_type(column) for column in columns]
    column_defs = ','.join(sql_columns)
    # Literal lines stay at column 0 so the emitted SQL text is unchanged.
    return f'''
CREATE TABLE {table_name}
(
{column_defs}
)
'''
# Main processing: read the CSV header and print the CREATE TABLE statement.
def main():
    csvpath = 'solubility.test.csv'
    columns = ReadCSVColumn.get_columns_list(csvpath)
    # Explicitly typed columns; everything else becomes double precision.
    columns_int = ['SampleID', 'Atoms', 'Bonds']
    columns_text = ['SampleName', 'Structure', 'ID', 'NAME', 'SOL', 'SMILES', 'SOL_classification']
    print(get_sql_create_table('RDKit', columns, columns_int, columns_text))
# Start process
if __name__ == '__main__':
    main()
Oh, and there's one more.
ReadCSVColumn.py
# Read a text file and return it as a list of lines.
# I: File path
# O: List of line strings (trailing newline characters preserved)
def get_lines_from_textfile(filepath):
    with open(filepath, 'r', encoding='utf-8') as f:
        return f.readlines()
# Return the first line of a CSV file as the list of column names.
# I: CSV path
# O: Item (column name) list
def get_columns_list(csvpath):
    # Use the csv module rather than a naive split(','): the reader strips the
    # trailing newline (both \n and \r\n) and correctly handles quoted header
    # fields that themselves contain commas.
    import csv
    with open(csvpath, 'r', encoding='utf-8', newline='') as f:
        # Raises StopIteration for a completely empty file.
        return next(csv.reader(f))
# Main processing: print every column name of the sample CSV, one per line.
def main():
    csvpath = 'solubility.test.csv'
    for column in get_columns_list(csvpath):
        print(column)
# Start process
if __name__ == '__main__':
    main()
If you run all of this, it should create a table on PostgreSQL with the same shape as the original CSV and insert the data into it. After that you can JOIN against master tables on the DB side and GROUP BY however you like.
Well... ideally Python would run inside PostgreSQL itself (i.e. PL/Python), but that's an issue for another day.
I feel like I've finally created a decent frame. Well, next time, I'll do something a little closer to the main subject.
Recommended Posts