Skip to content
Permalink
main
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
#!/usr/bin/python
from __future__ import print_function
import ast
import csv
import os
import sys
import re
import getopt
import time
import gzip
SCHEMAS = {}
VERBOSE = False
class DataTables:
def __init__(self):
self.data = {}
self.schema = {}
def add_table(self, current_table_name, schema):
if not current_table_name in self.schema:
self.schema[current_table_name] = schema
self.data[current_table_name] = []
self.current_table_name = current_table_name
def add_values(self,values):
self.data[self.current_table_name].extend(values)
def get_table(self,table_name):
return self.schema[table_name], self.data[table_name]
def get_table_dicts(self,table_name):
return [ dict( zip(self.schema[table_name], values ) ) for values in self.data[table_name] ]
def dump(self):
for table_name in self.get_table_names():
self.print_table(table_name)
def get_table_names(self):
return sorted(self.data.keys())
def print_table(self,table_name, file=None):
print('', file=file)
print( '"' + '","'.join(self.schema[table_name]) + '"', file=file )
for values in make_python_list_of_sequences(self.data[table_name]):
try:
print( '"' + '","'.join((str(x) for x in values)) + '"' , file=file)
except TypeError:
# Print helpful info before dying
print("ERROR table (%s) values(%s)" % (table_name, values) )
raise
def is_create_statement(line):
return line.startswith('CREATE TABLE')
def is_field_definition(line):
return line.strip().startswith('`')
def is_insert_statement(line):
return line.startswith('INSERT INTO')
def is_python_sequence(val):
return type(val) in (type(()), type([]))
def get_mysql_name_value(line):
value = None
result = re.search(r'\`([^\`]*)\`', line)
if result:
value = result.groups()[0]
return value
def get_value_tuples(line):
string_values = line.partition(' VALUES ')[-1].strip().replace('NULL', "''")
if string_values[-1] == ';':
string_values = string_values[:-1]
# Convert string that represent Python values into Python values
return ast.literal_eval(string_values)
# Make an atom, or a list of atoms, into a list of sequences.
def make_python_list_of_sequences(val):
if is_python_sequence(val):
if is_python_sequence(val[0]):
return val
else:
return [val]
else:
return [[val]]
def parse_dump_file(file_name, opts={}):
listonly = 'list' in opts
include = opts.get('include',None)
data = DataTables()
current_table_name = None
exclude = True
if file_name=='-':
read_file = sys.stdin
elif file_name.endswith('.gz'):
read_file = gzip.open(file_name,'r')
else:
read_file = open(file_name, 'r')
prev_table_name, prev_table_lineno, prev_table_chars = None, -1, 0
if listonly: print("%12s %s" % ('Bytes', 'Table'))
for lineno, line in enumerate(read_file):
prev_table_chars += len(line)
if is_create_statement(line):
current_table_name = get_mysql_name_value(line)
if listonly:
if prev_table_name:
print("%12d %s" % (prev_table_chars, prev_table_name) )
prev_table_name, prev_table_lineno = current_table_name, lineno
prev_table_chars = 0
continue
exclude = include and current_table_name not in include
if exclude: continue
SCHEMAS[current_table_name] = []
print_verbose("Table %s: created" % current_table_name)
elif current_table_name and is_field_definition(line):
if listonly or exclude: continue
field_name = get_mysql_name_value(line)
SCHEMAS[current_table_name].append(field_name)
print_verbose("Table %s: Add column %s" % (current_table_name,field_name))
elif is_insert_statement(line):
if listonly or exclude: continue
values = get_value_tuples(line)
# Add table value (and new table if needed) to data
data.add_table(current_table_name, SCHEMAS[current_table_name])
data.add_values(values)
print_verbose("Table %s: Storing %d rows" % (current_table_name,len(values)))
if file_name!='-': read_file.close()
if listonly:
if prev_table_name:
print("%12d %s" % (prev_table_chars, prev_table_name) )
return data
def Usage():
print("""
Usage: mysqldump_to_csv.py { - | DUMP_FILE }
Read the mysql dump file DUMP_FILE, and write each table as a separate CSV file in OUTPUT_DIR.
DUMP_FILE can be in *.gz format.
NOTE: For large mysql dump (1G or larger), extracting all tables could take 10 or 20 minutes.
It will be much quicker if you use the -i option to extract only the tables you want.
OPTIONS
-i NAME1[:NAME2[:NAME3]] Only include the following table names (use : as separator)
-l Just list the tables
-o OUTDIR Write each table into its own file in outdir, otherwise to stdout.
Directory OUTDIR will be created if it doesn't exist,
EXAMPLES
# List all tables in Mysql database dump file
$ mysqldump_to_csv.py -l 20200917-2300.slurmdb.sql
acct_coord_table
acct_table
....
qos_table
res_table
user_table
# Extract specific table
# Note that
# - output directory /tmp/mysql will be created if it doesn't exist.
# - input MySQL dump file is in gzip format.
$ mysqldump_to_csv.py -i acct_table:cluster_assoc_table:qos_table:user_table -o /tmp/mysql database.gz
""")
sys.exit()
def print_verbose(msg):
global VERBOSE
if VERBOSE:
print(time.strftime(" %Y-%m-%d %H:%M:%S " + msg))
def rename_keys(dictin,oldkeys,newkeys,oldprefix):
return dict( [(new,dictin[oldprefix+old]) for old,new in zip(oldkeys,newkeys) if oldprefix+old in dictin ] )
def get_options(argsin):
opts, args = getopt.getopt(argsin,'vli:o:')
opts = dict(opts)
if '-l' in opts and len(args)!=1 or len(args)==2: Usage()
if '-i' in opts:
opts['-i'] = opts['-i'].split(":")
opts = rename_keys( opts, ('i','l','v','o'), ('include','list','verbose','outdir'), oldprefix='-' )
return opts, args
#-----------------------------------------------------------------------
# Main
#-----------------------------------------------------------------------
if __name__ == '__main__':
opts, args = get_options(sys.argv[1:])
if not args: Usage()
VERBOSE = 'verbose' in opts
# Create output directory if it does not exist
if 'outdir' in opts and not os.path.isdir(opts['outdir']): os.makedirs(opts['outdir'])
# Parse dump file and make CSV files
data = parse_dump_file(args[0], opts)
table_names = table_names = opts['input'].split(":") if 'input' in opts else data.get_table_names()
# Just list file (currently to this from parse_dump_file, for quicker response)
if 'list' in opts:
pass
for table_name in table_names: print(table_name)
# Output csv
else:
outdir = opts.get('outdir',None)
# Print individual files for each table
if outdir:
for table_name in table_names:
fname = "%s/%s.csv" % (outdir, table_name )
with open(fname,"w") as fout:
data.print_table(table_name, file=fout)
# Print all tables to standard out
else:
for table_name in table_names:
data.print_table(table_name)