Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
generate_qos_cmds/mysqldump_to_csv.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
242 lines (187 sloc)
6.93 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
from __future__ import print_function | |
import ast | |
import csv | |
import os | |
import sys | |
import re | |
import getopt | |
import time | |
import gzip | |
SCHEMAS = {}  # table name -> list of column names; filled by parse_dump_file
VERBOSE = False  # set from the -v flag in __main__; read by print_verbose
class DataTables:
    """In-memory store of table schemas and row data parsed from a MySQL dump."""

    def __init__(self):
        # table name -> list of row tuples
        self.data = {}
        # table name -> list of column names
        self.schema = {}

    def add_table(self, current_table_name, schema):
        """Register a table (first time only) and make it the current insert target."""
        if current_table_name not in self.schema:
            self.schema[current_table_name] = schema
            self.data[current_table_name] = []
        self.current_table_name = current_table_name

    def add_values(self, values):
        """Append parsed row tuples to the current table's data."""
        self.data[self.current_table_name].extend(values)

    def get_table(self, table_name):
        """Return (schema, rows) for *table_name*."""
        return self.schema[table_name], self.data[table_name]

    def get_table_dicts(self, table_name):
        """Return the table's rows as dicts keyed by column name."""
        columns = self.schema[table_name]
        return [dict(zip(columns, row)) for row in self.data[table_name]]

    def dump(self):
        """Print every stored table, in sorted name order."""
        for name in self.get_table_names():
            self.print_table(name)

    def get_table_names(self):
        """Return all known table names, sorted."""
        return sorted(self.data)

    def print_table(self, table_name, file=None):
        """Write one table as double-quoted CSV (header row first) to *file*."""
        print('', file=file)
        header = '"' + '","'.join(self.schema[table_name]) + '"'
        print(header, file=file)
        for row in make_python_list_of_sequences(self.data[table_name]):
            try:
                print('"' + '","'.join(str(cell) for cell in row) + '"', file=file)
            except TypeError:
                # Print helpful info before dying
                print("ERROR table (%s) values(%s)" % (table_name, row))
                raise
def is_create_statement(line):
    """Return True when *line* opens a CREATE TABLE statement."""
    prefix = 'CREATE TABLE'
    return line.startswith(prefix)
def is_field_definition(line):
    """Return True when *line* looks like a backquoted column definition."""
    # Only leading whitespace matters for startswith, so lstrip is enough.
    return line.lstrip().startswith('`')
def is_insert_statement(line):
    """Return True when *line* opens an INSERT INTO statement."""
    prefix = 'INSERT INTO'
    return line.startswith(prefix)
def is_python_sequence(val):
    """Return True when *val* is a tuple or a list.

    Replaces the original `type(val) in (type(()), type([]))` comparison with
    the idiomatic isinstance check (also accepts tuple/list subclasses).
    """
    return isinstance(val, (tuple, list))
def get_mysql_name_value(line):
    """Return the first backquoted identifier in *line*, or None if absent."""
    match = re.search(r'\`([^\`]*)\`', line)
    return match.group(1) if match else None
def get_value_tuples(line):
    """Parse the VALUES clause of an INSERT statement into Python values.

    Returns whatever ast.literal_eval produces: a tuple of row tuples for a
    multi-row insert, or a single flat tuple for a one-row insert.  Returns
    an empty tuple when no VALUES clause is present (the original crashed
    with IndexError on `string_values[-1]` in that case).

    NOTE(review): the NULL -> '' substitution is purely textual, so a literal
    string "NULL" inside column data would also be rewritten — confirm this
    is acceptable for the dumps being processed.
    """
    string_values = line.partition(' VALUES ')[-1].strip().replace('NULL', "''")
    # rstrip drops the trailing ';' and is safe on an empty string.
    string_values = string_values.rstrip(';')
    if not string_values:
        return ()
    # Convert the string that represents Python values into Python values.
    return ast.literal_eval(string_values)
def make_python_list_of_sequences(val):
    """Normalize *val* into a list of sequences (rows).

    An atom becomes [[atom]], a flat sequence becomes [seq], and a sequence
    of sequences is returned unchanged.  An empty sequence yields an empty
    list (the original raised IndexError on val[0]).
    """
    if isinstance(val, (tuple, list)):
        if not val:
            # Empty table data: nothing to wrap.
            return []
        if isinstance(val[0], (tuple, list)):
            return val
        return [val]
    return [[val]]
def parse_dump_file(file_name, opts={}):
    """Parse a MySQL dump into a DataTables instance.

    file_name -- dump path; '-' reads stdin, '*.gz' paths are opened via gzip.
    opts      -- option dict from get_options(): 'list' prints per-table byte
                 counts instead of storing data; 'include' restricts parsing
                 to the named tables.

    Returns the populated DataTables (left empty when 'list' is set).
    Side effect: fills the module-level SCHEMAS dict with column names.

    NOTE(review): opts={} is a shared mutable default argument; harmless here
    only because opts is read and never mutated.
    """
    listonly = 'list' in opts
    include = opts.get('include',None)
    data = DataTables()
    current_table_name = None
    exclude = True
    # Choose the input stream: stdin, gzip, or a plain file.
    if file_name=='-':
        read_file = sys.stdin
    elif file_name.endswith('.gz'):
        read_file = gzip.open(file_name,'r')
    else:
        read_file = open(file_name, 'r')
    # State for the -l report: previous table's name, line, and byte count.
    prev_table_name, prev_table_lineno, prev_table_chars = None, -1, 0
    if listonly: print("%12s %s" % ('Bytes', 'Table'))
    for lineno, line in enumerate(read_file):
        prev_table_chars += len(line)
        if is_create_statement(line):
            current_table_name = get_mysql_name_value(line)
            if listonly:
                # A new CREATE marks the end of the previous table's bytes.
                if prev_table_name:
                    print("%12d %s" % (prev_table_chars, prev_table_name) )
                prev_table_name, prev_table_lineno = current_table_name, lineno
                prev_table_chars = 0
                continue
            # Skip tables not named by -i (when -i was given).
            exclude = include and current_table_name not in include
            if exclude: continue
            SCHEMAS[current_table_name] = []
            print_verbose("Table %s: created" % current_table_name)
        elif current_table_name and is_field_definition(line):
            # Column-definition line inside CREATE TABLE: record the name.
            if listonly or exclude: continue
            field_name = get_mysql_name_value(line)
            SCHEMAS[current_table_name].append(field_name)
            print_verbose("Table %s: Add column %s" % (current_table_name,field_name))
        elif is_insert_statement(line):
            if listonly or exclude: continue
            values = get_value_tuples(line)
            # Add table value (and new table if needed) to data
            data.add_table(current_table_name, SCHEMAS[current_table_name])
            data.add_values(values)
            print_verbose("Table %s: Storing %d rows" % (current_table_name,len(values)))
    if file_name!='-': read_file.close()
    if listonly:
        # Flush the report line for the final table in the dump.
        if prev_table_name:
            print("%12d %s" % (prev_table_chars, prev_table_name) )
    return data
def Usage():
    """Print the command-line help text and exit the process."""
    print("""
Usage: mysqldump_to_csv.py { - | DUMP_FILE }
Read the mysql dump file DUMP_FILE, and write each table as a separate CSV file in OUTPUT_DIR.
DUMP_FILE can be in *.gz format.
NOTE: For large mysql dump (1G or larger), extracting all tables could take 10 or 20 minutes.
It will be much quicker if you use the -i option to extract only the tables you want.
OPTIONS
-i NAME1[:NAME2[:NAME3]] Only include the following table names (use : as separator)
-l Just list the tables
-o OUTDIR Write each table into its own file in outdir, otherwise to stdout.
Directory OUTDIR will be created if it doesn't exist,
EXAMPLES
# List all tables in Mysql database dump file
$ mysqldump_to_csv.py -l 20200917-2300.slurmdb.sql
acct_coord_table
acct_table
....
qos_table
res_table
user_table
# Extract specific table
# Note that
# - output directory /tmp/mysql will be created if it doesn't exist.
# - input MySQL dump file is in gzip format.
$ mysqldump_to_csv.py -i acct_table:cluster_assoc_table:qos_table:user_table -o /tmp/mysql database.gz
""")
    sys.exit()
def print_verbose(msg):
    """Print *msg* with a timestamp prefix when the module VERBOSE flag is set.

    Fix: the original passed msg through time.strftime(fmt + msg), so any
    '%' character inside msg was interpreted as a strftime directive.
    Formatting only the timestamp and concatenating afterwards preserves the
    output for '%'-free messages while making '%' in messages safe.
    """
    if VERBOSE:
        print(time.strftime(" %Y-%m-%d %H:%M:%S ") + msg)
def rename_keys(dictin, oldkeys, newkeys, oldprefix):
    """Map each new key to dictin[oldprefix + old], keeping only entries
    whose prefixed old key actually exists in *dictin*."""
    renamed = {}
    for old, new in zip(oldkeys, newkeys):
        prefixed = oldprefix + old
        if prefixed in dictin:
            renamed[new] = dictin[prefixed]
    return renamed
def get_options(argsin):
    """Parse command-line arguments via getopt and return (opts, args),
    where opts uses friendly key names ('include', 'list', 'verbose',
    'outdir') and '-i' values are split on ':'. Calls Usage() (which exits)
    on an invalid argument count."""
    raw_opts, args = getopt.getopt(argsin, 'vli:o:')
    raw_opts = dict(raw_opts)
    # Parentheses make the original operator precedence explicit:
    # (-l given AND argc != 1) OR argc == 2.
    if ('-l' in raw_opts and len(args) != 1) or len(args) == 2:
        Usage()
    if '-i' in raw_opts:
        raw_opts['-i'] = raw_opts['-i'].split(":")
    opts = rename_keys(raw_opts, ('i', 'l', 'v', 'o'),
                       ('include', 'list', 'verbose', 'outdir'), oldprefix='-')
    return opts, args
#-----------------------------------------------------------------------
# Main
#-----------------------------------------------------------------------
if __name__ == '__main__':
    opts, args = get_options(sys.argv[1:])
    if not args: Usage()
    VERBOSE = 'verbose' in opts
    # Create output directory if it does not exist
    if 'outdir' in opts and not os.path.isdir(opts['outdir']):
        os.makedirs(opts['outdir'])
    # Parse dump file and make CSV files
    data = parse_dump_file(args[0], opts)
    # NOTE: the original read `opts['input'].split(":")` here, but
    # get_options never produces an 'input' key (rename_keys maps -i to
    # 'include'), so that branch was dead code — and -i filtering already
    # happens inside parse_dump_file.  The effective behavior was always to
    # use the parsed table names; the dead branch (and a duplicated
    # `table_names = table_names =` assignment) is removed.
    table_names = data.get_table_names()
    # Just list the tables (sizes were already printed by parse_dump_file)
    if 'list' in opts:
        for table_name in table_names:
            print(table_name)
    # Output csv
    else:
        outdir = opts.get('outdir', None)
        # Print individual files for each table
        if outdir:
            for table_name in table_names:
                fname = "%s/%s.csv" % (outdir, table_name)
                with open(fname, "w") as fout:
                    data.print_table(table_name, file=fout)
        # Print all tables to standard out
        else:
            for table_name in table_names:
                data.print_table(table_name)