From fa06a68270558b690b23691316ccfdd611b4f881 Mon Sep 17 00:00:00 2001 From: Jerry Shi Date: Sat, 23 Dec 2023 18:24:27 -0500 Subject: [PATCH] support html table in cleanup-roster.py --- cleanup-roster.py | 71 ++++++++++++++++++++++++++++++++++++++++------- readme.MD | 26 ++++++++++++----- 2 files changed, 80 insertions(+), 17 deletions(-) diff --git a/cleanup-roster.py b/cleanup-roster.py index 7607d81..73a8e04 100644 --- a/cleanup-roster.py +++ b/cleanup-roster.py @@ -1,7 +1,6 @@ #!/usr/bin/python3 import sys, argparse, fileinput, logging import csv, re -import openpyxl from enum import Enum, auto @@ -28,8 +27,9 @@ def is_hot_major(m, checksecond = False): def get_major_from_program(prog): # m = re.findall(r"(?: -|/)([^/]+)", prog) # majors = re.search("_x000D_(.+)", prog) - majors = re.search(r" -\s+(.+)", prog) + prog = re.sub(r"&amp;", "&", prog) # print(prog) + majors = re.search(r" -\s*(.+)", prog) assert majors is not None # print(majors.group()) @@ -50,6 +50,7 @@ def get_major_from_program(prog): class State(Enum): START = auto() + HEADER = auto() ROW = auto() class Students: @@ -67,7 +68,7 @@ class Students: self.field_list = [] for cell in row: # short name is only the first word - shortname = cell.value.split(maxsplit=1)[0].lower() + shortname = cell.split(maxsplit=1)[0].lower() self.name_to_idx[shortname] = idx self.field_list.append(shortname) idx += 1 @@ -79,9 +80,9 @@ class Students: if len(row) != len(self.field_list): logging.warning("The number of fields does not seem correct: " - + ','.join(m)) - m = [ c.value for c in row ] - self.student_list.append(m) + + ','.join(row)) + self.student_list.append(row) + # print(len(self.student_list), row) def get_field_list(self): return self.field_list @@ -134,24 +135,74 @@ def write_csv(file, students, nl = None): for s in students: csvwriter.writerow(s) -def load_file(file, students): +# load from real xlsx files. 
+def load_file_xlsx(file, students): + import openpyxl try: wb = openpyxl.load_workbook(file, data_only=True) worksheet = wb.worksheets[0] first = True for row in worksheet.rows: + m = [ c.value for c in row ] if first: - students.add_header(row) + students.add_header(m) first = False else: - students.add(row) + students.add(m) except Exception as e: print(e) return False return True +### Helper functions +def is_tr(s): + return s.startswith('') + +# load from HTML file +def load_html_file(file, students): + + state = State.START + try: + with open(file, 'r', encoding='utf-8') as infile: + for line in infile: + + if state == State.START: + if is_tr(line): + row = "" + state = State.HEADER + + elif state == State.HEADER: + if is_tr(line): + # header + m = re.findall(r']*>(.*?)', row, re.IGNORECASE) + students.add_header(m) + + row = "" + state = State.ROW + else: + # remove spaces at the end of the line, including newline + row += line.rstrip() + + else: #State.ROW + if is_end_of_row(line): + m = re.findall(r']*>(.*?)', row, re.IGNORECASE) + students.add(m) + logging.debug(m) + + row = "" + else: + row += line.rstrip() + + except FileNotFoundError as e: + print(e) + return False + return True + parser = argparse.ArgumentParser(description='Clean up downloaded roster from HuskyCT. April 2023.') parser.add_argument('infiles', nargs='+', default=[], help='Input files. Can have multiple files.') parser.add_argument('-o', nargs='?', default='', const='', help='Output file.') @@ -173,7 +224,7 @@ students = Students() for file in args.infiles: logging.info(f"Loading from {file}...") - if not load_file(file, students): + if not load_html_file(file, students): exit(1) if args.listfields: diff --git a/readme.MD b/readme.MD index fda41e3..8482d33 100644 --- a/readme.MD +++ b/readme.MD @@ -1,9 +1,8 @@ ## A Python script that cleans up rosters downloaded from HuskyCT -The downloaded file is actually an HTML file although Excel can -extract tables from it. 
The format may have changed recently. Now -a student's info is on multiple rows. This script reads a downloaded -file and writes selected columns into a CSV file. +This script reads downloaded roster files and writes selected columns/fields +into a CSV file. If the output filename is not specified, the lines are +printed to the standard output, which can be redirected into a file. ### Usage @@ -40,15 +39,28 @@ python cleanup-roster.py downloaded.xls --fields all In Powershell, the following commands process multiple xls files. ``` -# appends all student rows to all.csv -cat *.xls | py cleanup-roster.py -o all.csv +# write all students from multiple files to a single output file, all.csv +py cleanup-roster.py $(dir *.xls) -o all.csv # save student records in separate csv files -dir *.xls | foreach { py cleanup-roster.py $_ -o ($_.name -replace '.xls','.csv') } +dir *.xls | foreach-object { py cleanup-roster.py $_ -o ($_.name -replace '.xls','.csv') } ``` ## Notes +### Dec 2023 + +The downloaded files are actually HTML files, and the information is in an HTML +table. Excel can open them, after a warning. The format of the file has changed +several times. Currently, the "Program and Plan" field is in a single table +cell. For example, + + Engineering - + ^MComputer Science + +Changed the script to read the downloaded file directly, instead of reading XLSX +files. + ### Nov 2022 The format of downloaded files keeps changing, mainly in "Program and Plan" field.