Hello everyone
I need to collect data out of a bunch of CSV files (16 GB) and create some new CSV files from the useful data. To achieve this I would like to access the data with a Python script in my Ubuntu subsystem environment and then analyze the data with MySQL Workbench on Windows. However, as of now I do not understand how I can link MySQL Workbench with the gathered data in the subsystem. How can I achieve this?
For the gathering of the data I use the following python code:
# -*- coding: utf-8 -*-
#
# The script will drop any existing table with the same name and schema as
# set in the settings, then read all csv files that contain the key word
# (string) and create a new table appending all data from the fitting csv files
##############################################################################
from sqlalchemy import create_engine
import import_csv_to_sql_functions as sqlf

# directory containing the csv files to import
data_dir = "path"
#
# db username
db_username = "root"
#
# db password
# NOTE(review): avoid hard-coding credentials; consider an environment
# variable or a config file excluded from version control.
db_password = "password"
#
# filtering key word (only files containing this will be included)
filtering_kw = "Unavailability"
#
# name of mysql schema to add table
mysql_schema = "raw"
#
# name of mysql table to add data in
mysql_table = "generator_data"
#
##############################################################################
if __name__ == '__main__':
    # Build the engine once; insert_into_sql_table manages its own
    # connections, so the previously opened (and leaked) extra connection
    # is not needed here.
    engine = create_engine('mysql+pymysql://' + db_username + ':' +
                           db_password + '@localhost/')
    try:
        sqlf.insert_into_sql_table(engine, data_dir, mysql_schema,
                                   mysql_table, filtering_kw)
    finally:
        # Release any pooled connections held by the engine.
        engine.dispose()
    print("Done!")
With the function defined here:
# -*- coding: utf-8 -*-
import os
import time
import pandas as pd
# function used to insert data into sql database
def insert_into_sql_table(engine, data_dir, mysql_schema, mysql_table,
                          filtering_kw=None):
    """Recreate ``mysql_schema.mysql_table`` from the csv files in *data_dir*.

    Drops the table if it already exists, then appends the contents of every
    tab-separated csv file in ``data_dir`` whose filename contains
    ``filtering_kw`` (all files when ``filtering_kw`` is None).

    Parameters
    ----------
    engine : sqlalchemy engine connected to the target MySQL server.
    data_dir : str, directory holding the csv files.
    mysql_schema : str, target schema name.
    mysql_table : str, target table name.
    filtering_kw : str or None, substring a filename must contain to be
        imported; None imports every file.

    Returns
    -------
    bool : True on completion.
    """
    # Identifiers cannot be bound as SQL parameters, so quote them with
    # backticks (doubling embedded backticks) instead of splicing raw
    # strings into the statement -- guards against injection/odd names.
    qualified_name = "`{}`.`{}`".format(mysql_schema.replace("`", "``"),
                                        mysql_table.replace("`", "``"))
    # drop table if already existant; the context manager guarantees the
    # connection is closed even if the statement raises.
    with engine.connect() as connection:
        connection.execute("DROP TABLE IF EXISTS " + qualified_name + ";")
    for file in sorted(os.listdir(data_dir)):
        file_path = os.path.join(data_dir, file)
        # skip subdirectories and other non-regular entries
        if not os.path.isfile(file_path):
            continue
        # checking if keyword is in filename, else file will not be imported
        if filtering_kw is None or filtering_kw in file:
            print("##############################################")
            print(file)
            start_time = time.time()
            # load csv file as pandas dataframe (tab-separated input)
            df = pd.read_csv(file_path, sep='\t')
            intermediate_time = time.time()
            # insert dataframe into mysql database
            df.to_sql(name=mysql_table, con=engine, schema=mysql_schema,
                      if_exists='append', chunksize=10000, method='multi',
                      index=False)
            end_time = time.time()
            print("csv load time: ", round(intermediate_time - start_time, 0), "s")
            print("Insert time: ", round(end_time - intermediate_time, 0), "s")
            print("Total Time: ", round((end_time - start_time) / 60, 1), "m")
    return True
I would greatly appreciate any help or tips.
Best
David
I need to collect data out of a bunch of CSV files (16 GB) and create some new CSV files from the useful data. To achieve this I would like to access the data with a Python script in my Ubuntu subsystem environment and then analyze the data with MySQL Workbench on Windows. However, as of now I do not understand how I can link MySQL Workbench with the gathered data in the subsystem. How can I achieve this?
For the gathering of the data I use the following python code:
# -*- coding: utf-8 -*-
#
# The script will drop any existing table with the same name and schema as
# set in the settings, then read all csv files that contain the key word
# (string) and create a new table appending all data from the fitting csv files
##############################################################################
from sqlalchemy import create_engine
import import_csv_to_sql_functions as sqlf

# directory containing the csv files to import
data_dir = "path"
#
# db username
db_username = "root"
#
# db password
# NOTE(review): avoid hard-coding credentials; consider an environment
# variable or a config file excluded from version control.
db_password = "password"
#
# filtering key word (only files containing this will be included)
filtering_kw = "Unavailability"
#
# name of mysql schema to add table
mysql_schema = "raw"
#
# name of mysql table to add data in
mysql_table = "generator_data"
#
##############################################################################
if __name__ == '__main__':
    # Build the engine once; insert_into_sql_table manages its own
    # connections, so the previously opened (and leaked) extra connection
    # is not needed here.
    engine = create_engine('mysql+pymysql://' + db_username + ':' +
                           db_password + '@localhost/')
    try:
        sqlf.insert_into_sql_table(engine, data_dir, mysql_schema,
                                   mysql_table, filtering_kw)
    finally:
        # Release any pooled connections held by the engine.
        engine.dispose()
    print("Done!")
With the function defined here:
# -*- coding: utf-8 -*-
import os
import time
import pandas as pd
# function used to insert data into sql database
def insert_into_sql_table(engine, data_dir, mysql_schema, mysql_table,
                          filtering_kw=None):
    """Recreate ``mysql_schema.mysql_table`` from the csv files in *data_dir*.

    Drops the table if it already exists, then appends the contents of every
    tab-separated csv file in ``data_dir`` whose filename contains
    ``filtering_kw`` (all files when ``filtering_kw`` is None).

    Parameters
    ----------
    engine : sqlalchemy engine connected to the target MySQL server.
    data_dir : str, directory holding the csv files.
    mysql_schema : str, target schema name.
    mysql_table : str, target table name.
    filtering_kw : str or None, substring a filename must contain to be
        imported; None imports every file.

    Returns
    -------
    bool : True on completion.
    """
    # Identifiers cannot be bound as SQL parameters, so quote them with
    # backticks (doubling embedded backticks) instead of splicing raw
    # strings into the statement -- guards against injection/odd names.
    qualified_name = "`{}`.`{}`".format(mysql_schema.replace("`", "``"),
                                        mysql_table.replace("`", "``"))
    # drop table if already existant; the context manager guarantees the
    # connection is closed even if the statement raises.
    with engine.connect() as connection:
        connection.execute("DROP TABLE IF EXISTS " + qualified_name + ";")
    for file in sorted(os.listdir(data_dir)):
        file_path = os.path.join(data_dir, file)
        # skip subdirectories and other non-regular entries
        if not os.path.isfile(file_path):
            continue
        # checking if keyword is in filename, else file will not be imported
        if filtering_kw is None or filtering_kw in file:
            print("##############################################")
            print(file)
            start_time = time.time()
            # load csv file as pandas dataframe (tab-separated input)
            df = pd.read_csv(file_path, sep='\t')
            intermediate_time = time.time()
            # insert dataframe into mysql database
            df.to_sql(name=mysql_table, con=engine, schema=mysql_schema,
                      if_exists='append', chunksize=10000, method='multi',
                      index=False)
            end_time = time.time()
            print("csv load time: ", round(intermediate_time - start_time, 0), "s")
            print("Insert time: ", round(end_time - intermediate_time, 0), "s")
            print("Total Time: ", round((end_time - start_time) / 60, 1), "m")
    return True
I would greatly appreciate any help or tips.
Best
David