Python hand play (interoperability between CSV and PostgreSQL)

What is this article?

I've been thinking for a long time. I have it as a variable in the program, but I want to make it CSV because it shuts down. I mean, considering cross-search, I want to put it in the DB. But it's a pain to write mutual conversions one by one ...

This is what I was able to think of, such as creating a table from column information at a minimum and plunging the value into it.

So what did you make?

First, pick up the column names from the CSV file. As expected, I do not want to automatically determine the type of each column, so specify it. Well, since the purpose is machine learning, the default form of columns is double precision floating point. So, specify integer type and text type columns. Then, based on that, use CREATE TABLE, INSERT INTO from the column name you just got. (Well, you can use Pandas for INSERT, but it's easier to do various other things if you make this flow.)

code

 CalculateDescriptors.py
import os
import pandas as pd


# Returns the compound
# I: SDF path
# O: Compound object list
def get_mols(sdfpath):
    from rdkit import Chem
    mols = [mol for mol in Chem.SDMolSupplier(sdfpath) if mol is not None]
    return mols


# Returns basic information about a compound [Compound name, structural information, number of atoms, number of bonds, SMILES, InChI]
# I: Compound object list
# O: Result data
def get_values_base(mols):
    from rdkit import Chem
    columns = ['Name', 'Structure', 'Atoms', 'Bonds', 'SMILES', 'InChI']
    values = list()
    for mol in mols:
        tmp = list()
        tmp.append(mol.GetProp('_Name'))
        tmp.append(Chem.MolToMolBlock(mol))
        tmp.append(mol.GetNumAtoms())
        tmp.append(mol.GetNumBonds())
        tmp.append(Chem.MolToSmiles(mol))
        tmp.append(Chem.MolToInchi(mol))
        values.append(tmp)
    index = [i for i in range(len(mols))]
    df = pd.DataFrame(values, columns=columns, index=index)
    return df


# Returns the external parameters of the compound
# I: Compound object list
# O: Result data
def get_values_external(mols):
    from rdkit import Chem
    columns = ['ID', 'NAME', 'SOL', 'SMILES', 'SOL_classification']
    values = list()
    for mol in mols:
        tmp = list()
        for column in columns:
            tmp.append(mol.GetProp(column))
        values.append(tmp)
    columns = ['ext_' + column for column in columns]
    index = [i for i in range(len(mols))]
    df = pd.DataFrame(values, columns=columns, index=index)
    return df


# Calculate descriptor: RDKit
# I: Compound object list
# O: Result data
def get_rdkit_descriptors(mols):
    from rdkit.Chem import AllChem, Descriptors
    from rdkit.ML.Descriptors import MoleculeDescriptors
 # RDKit descriptor calculation
    # names = [mol.GetProp('_Name') for mol in mols]
    descLists = [desc_name[0] for desc_name in Descriptors._descList]
    calcs = MoleculeDescriptors.MolecularDescriptorCalculator(descLists)
    values = [calcs.CalcDescriptors(mol) for mol in mols]
 Convert to #DataFrame
    index = [i for i in range(len(mols))]
    df = pd.DataFrame(values, columns=descLists, index=index)
    return df


# Calculate descriptor: mordred
# I: Compound object list
# O: Result data
def get_mordred_descriptors(mols):
 Calculation of # mordred descriptor
    from mordred import Calculator, descriptors
    calcs = Calculator(descriptors, ignore_3D=False)
    df = calcs.pandas(mols)
    df['index'] = [i for i in range(len(mols))]
    df.set_index('index', inplace=True)
    return df


# Calculate descriptor: CDK
# I: SDF file
# java executable file path
# CDK jar file path
# O: Result data
def get_cdk_descriptors(sdfpath, workfolderpath, java_path, cdk_jar_path):
    filepath = os.path.join(workfolderpath, 'tmp.csv')
    import subprocess
    command = f'{java_path} -jar {cdk_jar_path} -b {sdfpath} -t all -o {filepath}'
    print(command)
    subprocess.run(command, shell=False)
    df = pd.read_table(filepath)
    os.remove(filepath)
    return df


# Main processing
def main():
    data_folderpath = 'D:\\data\\python_data\\chem'
    sdfpath = os.path.join(data_folderpath, 'sdf\\solubility.test.20.sdf')
    csvpath = 'solubility.test.csv'

    java_path = 'C:\\Program Files\\Java\\jdk-14.0.1\\bin\\java.exe'
    workfolderpath = os.path.dirname(os.path.abspath(__file__))
    cdk_jar_path = os.path.join(data_folderpath, 'jar\\CDKDescUI-1.4.6.jar')

 # Compound acquisition
    mols = get_mols(sdfpath)

 # Get each value
 # (python library)
    dfs = list()
    for calcs in [get_values_base, get_values_external, get_rdkit_descriptors, get_mordred_descriptors]:
        dfs.append(calcs(mols))

 # (jar file calculation)
    dfs.append(get_cdk_descriptors(sdfpath, workfolderpath, java_path, cdk_jar_path))

 # Combine all
    df = pd.concat(dfs, axis=1)
    df.to_csv('all_parameters.csv')
    print(df)


# Start process
if __name__ == '__main__':
    main()

Also one.

 CreateSQL.py
import ReadCSVColumn


# Returns SQL: DROP
# I: Table name
 O : SQL
def get_sql_drop_table(table_name):
    sql = f'DROP TABLE IF EXISTS {table_name}'
    return sql


# Returns SQL: SELECT
# I: Table name
# Column name list
 O : SQL
def get_sql_select(table_name, columns):
    sql = f'''
        SELECT
                {','.join(columns)}
        FROM
                {table_name}
    '''
    return sql


# Returns SQL: INSERT
# I: Table name
# Column name list
 O : SQL
def get_sql_insert_table(table_name, columns):
    sql = f'''
        INSERT INTO
                {table_name}
                (
                    {','.join(columns)}
                )
        VALUES
                (
                    {','.join(['%s' for column in columns])}
                )
    '''
    return sql


# Returns SQL: CREATE TABLE
# I: Table name
# Column name list
# Column name list (repost: integer type)
# Column name list (repost: text type)
 O : SQL
def get_sql_create_table(table_name, columns, columns_int, columns_text):
    sql_columns = list()
    for column in columns:
        if column in columns_int:
            sql_columns.append(column + ' ' + 'integer')
        elif column in columns_text:
            sql_columns.append(column + ' ' + 'text')
        else:
            sql_columns.append(column + ' ' + 'double precision')
    sql = f'''
        CREATE TABLE {table_name}
        (
            {','.join(sql_columns)}
        )
    '''
    return sql


# Main processing
def main():
    csvpath = 'solubility.test.csv'
    columns = ReadCSVColumn.get_columns_list(csvpath)
    columns_int = ['SampleID', 'Atoms', 'Bonds']
    columns_text = ['SampleName', 'Structure', 'ID', 'NAME', 'SOL', 'SMILES', 'SOL_classification']
    sql = get_sql_create_table('RDKit', columns, columns_int, columns_text)
    print(sql)


# Start process
if __name__ == '__main__':
    main()

Oh, there was one too.

 ReadCSVColumn.py


# Read a text file in line list format
# I: File path
# O: Text list
def get_lines_from_textfile(filepath):
    with open(filepath, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    return lines


# Returns the first line from the CSV path list as a read item list
# I: CSV path
# O: Item list
def get_columns_list(csvpath):
 # There was a line break in the last item, so remove it
    line_1st = get_lines_from_textfile(csvpath)[0]
 # There is a line break at the end of the first line, so remove it
    line_1st = line_1st.replace('\n', '')
    return line_1st.split(',')


# Main processing
def main():
    csvpath = 'solubility.test.csv'
    columns = get_columns_list(csvpath)
    for column in columns:
        print(column)


# Start process
if __name__ == '__main__':
    main()

If you move all of this, you should create a table of the same type as the first CSV on PostgreSQL and insert it. After that, JOIN with the master on the DB and GROUP BY freely.

Well ... I think it's best if Python runs on PostgreSQL ... Well, it's a future issue.

Impressions

I feel like I've finally created a decent frame. Well, next time, I'll do something a little closer to the main subject.

Recommended Posts

Python hand play (interoperability between CSV and PostgreSQL)
Python hand play (division)
Python hand play (get column names from CSV file)
Python hand play (two-dimensional list)
Python hand play (argparse minimum code)
Difference between Ruby and Python split
Python hand play (Pandas / DataFrame beginning)
Difference between java and python (memo)
Difference between == and is in python
Cooperation between python module and API
Differences between Python, stftime and strptime
Difference between python2 series and python3 series dict.keys ()
Python CSV file reading and writing
Reading and writing CSV with Python
[Python] Difference between function and method
Python --Difference between exec and eval
[Python] Difference between randrange () and randint ()
[Python] Difference between sorted and sorted (Colaboratory)
Python hand play (RDKit descriptor calculation: SDF to CSV using Pandas)
Differences in authenticity between Python and JavaScript
difference between statements (statements) and expressions (expressions) in Python
Python hand play (descriptor calculation: serious version)
Differences in syntax between Python and Java
Difference between PHP and Python finally and exit
Search and play YouTube videos in Python
Difference between @classmethod and @staticmethod in Python
Fractal to make and play with Python
Difference between append and + = in Python list
Difference between nonlocal and global in Python
[Python] Difference between class method and static method
Read Python csv and export to txt
[Python3] Switch between Shift_JIS, UTF-8 and ASCII
[Python Iroha] Difference between List and Tuple
[python] Difference between rand and randn output
Differences in multithreading between Python and Jython
Differences between Ruby and Python (basic syntax)
Load csv with pandas and play with Index
Correspondence between Python built-in functions and Rust
Exchange encrypted data between Python and C #
Python hand play (let's get started with AtCoder?)
Scraping tabelog with python and outputting to CSV
Summary of the differences between PHP and Python
Read JSON with Python and output as CSV
The answer of "1/2" is different between python2 and 3
[python] Difference between variables and self. Variables in class
Python hand play (one line notation of if)
[Python] Conversion memo between time data and numerical data
About the difference between "==" and "is" in python
Reading and writing CSV and JSON files in Python
About the difference between PostgreSQL su and sudo
Example of reading and writing CSV with Python
Play Python async
Csv in python
Full-width and half-width processing of CSV data in Python
File write speed comparison experiment between python 2.7.9 and pypy 2.5.0
[Ruby vs Python] Benchmark comparison between Rails and Flask
Control other programs from Python (communication between Python and exe)
Difference between Ruby and Python in terms of variables
Indent behavior of json.dumps is different between python2 and python3
[Ubuntu] [Python] Face detection comparison between dlib and OpenCV
[Introduction for beginners] Reading and writing Python CSV files