table des matières
Installez d'abord pyorc pour gérer les fichiers ORC en Python
pip install pyorc
Site officiel https://pypi.org/project/pyorc/
Le programme d'exemple ci-dessous effectue deux étapes : 1. il lit les données d'un fichier CSV et les convertit en ORC ; 2. il relit les données ORC créées à l'étape 1 et les reconvertit en CSV. Tout d'abord, préparez un fichier CSV comme celui-ci et nommez-le **source.csv**. Veuillez mettre les chaînes de caractères entre guillemets doubles (").
1,"Amazon","AWS",3.2
2,"MicroSoft","Azure",0.142
3,"Google","GCP",10.0
4,"SaleForce","SalesCloud",2.5
5,"Git","GitHub",0.342
import csv
import glob
import re

import pyorc
"""
Writing process
"""
# A field's ORC type is inferred from how it is written in the CSV:
# "quoted" -> string, digits.digits -> double, bare digits -> int.
# The patterns are compiled once here instead of being re-matched per field.
_QUOTED_FIELD = re.compile(r'^".*"$')
_DOUBLE_FIELD = re.compile(r'^\d+\.\d+$')
_INT_FIELD = re.compile(r'^\d+$')


def _parse_field(column):
    """Return ``(value, orc_type)`` for one raw CSV field, or ``None``.

    ``None`` means the text matched none of the three recognised forms
    (for example an unquoted word, a negative number or an empty field).
    """
    if _QUOTED_FIELD.match(column):
        return column.strip('"'), "string"
    if _DOUBLE_FIELD.match(column):
        return float(column), "double"
    if _INT_FIELD.match(column):
        return int(column), "int"
    return None


# Read source data. In this case, we convert CSV to ORC.
# The CSV is parsed completely before the output file is opened, so a missing
# or unreadable source.csv does not leave an empty target.orc behind.
with open("./source.csv", "r") as source:
    # strip() removes the line terminator; blank lines hold no record.
    lines = [text for text in (raw.strip() for raw in source) if text]

if not lines:
    raise ValueError("./source.csv contains no records; cannot infer an ORC schema")

records = []
header_name = []
# Rows are identified by position, not by comparing their text with the first
# line: a later row identical to the first must not re-run the schema step.
for row_number, line in enumerate(lines):
    # NOTE: a plain split(",") is kept, so a quoted string that itself
    # contains a comma is still broken into two fields.
    parsed = [
        field
        for field in map(_parse_field, line.split(","))
        # Unrecognised fields are dropped, as before.
        if field is not None
    ]
    # One record is packed as a tuple.
    records.append(tuple(value for value, _ in parsed))
    # The first record decides the column names and types of the ORC table.
    if row_number == 0:
        header_name = [
            f"col{i}:{orc_type}" for i, (_, orc_type) in enumerate(parsed)
        ]

header_name = f'struct<{",".join(header_name)}>'
print(records)

with open("./target.orc", "wb") as data:
    # pyorc.Writer takes the binary file object first and the schema
    # string second.
    with pyorc.Writer(data, header_name) as writer:
        for record in records:
            writer.write(record)
"""
Reading process
"""
with open("./target.orc", "rb") as data:
    # Iterating the reader yields row data only; column names are taken
    # from the schema below.
    reader = pyorc.Reader(data)
    # Iterating `columns` yields column names; indexing by name gives that
    # column's type object.
    columns = reader.schema.fields
    for column in columns:
        print(column)
        print(columns[column].kind)
    # newline="" hands line-ending control to the csv module, as the csv
    # documentation recommends.
    with open("./target.csv", "w", newline="") as f:
        # csv.writer quotes any value containing a comma, quote or newline,
        # which a bare ",".join() would emit unescaped. Rows are streamed
        # straight from the reader instead of being collected first.
        # No header row is written.
        csv.writer(f, lineterminator="\n").writerows(reader)
Installation de la bibliothèque
pip install pandas pyarrow
Créez le script suivant dans le même répertoire que le fichier target.csv produit par l'exécution de l'étape 1, puis exécutez-le.
#-*- encoding:utf-8 -*-
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd
"""
CSV to Parquet
"""
# CSV -> DataFrame
# target.csv is written without a header row, so header=None is required;
# otherwise pandas would use the first record as column names.
df = pd.read_csv("./target.csv", header=None)
# Give the columns explicit string names (col0, col1, ...), the same
# naming the ORC step uses for its schema.
df.columns = [f"col{i}" for i in range(df.shape[1])]
# DataFrame -> Arrow Table
table = pa.Table.from_pandas(df)
# Arrow Table -> Parquet
pq.write_table(table, "target.pq")
"""
Parquet to CSV
"""
# Parquet -> Arrow Table
table2 = pq.read_table("target.pq")
# Arrow Table -> DataFrame
# Convert the table that was just read back from disk (table2), so the output
# reflects the Parquet file's contents and not the earlier in-memory table.
df2 = table2.to_pandas()
# DataFrame -> CSV
# to_csv() returns None when given a path, so its result is not kept.
df2.to_csv("target2.csv")
Recommended Posts