diff --git a/cleanup-roster.py b/cleanup-roster.py
index 7607d81..73a8e04 100644
--- a/cleanup-roster.py
+++ b/cleanup-roster.py
@@ -1,7 +1,6 @@
#!/usr/bin/python3
import sys, argparse, fileinput, logging
import csv, re
-import openpyxl
from enum import Enum, auto
@@ -28,8 +27,9 @@ def is_hot_major(m, checksecond = False):
def get_major_from_program(prog):
# m = re.findall(r"(?: -|/)([^/]+)", prog)
# majors = re.search("_x000D_(.+)", prog)
- majors = re.search(r" -\s+(.+)", prog)
+ prog = re.sub(r"&amp;", "&", prog)
# print(prog)
+ majors = re.search(r" -\s*(.+)", prog)
assert majors is not None
# print(majors.group())
@@ -50,6 +50,7 @@ def get_major_from_program(prog):
class State(Enum):
START = auto()
+ HEADER = auto()
ROW = auto()
class Students:
@@ -67,7 +68,7 @@ class Students:
self.field_list = []
for cell in row:
# short name is only the first word
- shortname = cell.value.split(maxsplit=1)[0].lower()
+ shortname = cell.split(maxsplit=1)[0].lower()
self.name_to_idx[shortname] = idx
self.field_list.append(shortname)
idx += 1
@@ -79,9 +80,9 @@ class Students:
if len(row) != len(self.field_list):
logging.warning("The number of fields does not seem correct: "
- + ','.join(m))
- m = [ c.value for c in row ]
- self.student_list.append(m)
+ + ','.join(row))
+ self.student_list.append(row)
+ # print(len(self.student_list), row)
def get_field_list(self):
return self.field_list
@@ -134,24 +135,74 @@ def write_csv(file, students, nl = None):
for s in students:
csvwriter.writerow(s)
-def load_file(file, students):
+# load from real xlsx files.
+def load_file_xlsx(file, students):
+ import openpyxl
try:
wb = openpyxl.load_workbook(file, data_only=True)
worksheet = wb.worksheets[0]
first = True
for row in worksheet.rows:
+ m = [ c.value for c in row ]
if first:
- students.add_header(row)
+ students.add_header(m)
first = False
else:
- students.add(row)
+ students.add(m)
except Exception as e:
print(e)
return False
return True
+### Helper functions
+def is_tr(s):
+ return s.startswith('<tr>')
+
+def is_end_of_row(s):
+ return s.startswith('</tr>')
+
+# load from HTML file
+def load_html_file(file, students):
+
+ state = State.START
+ try:
+ with open(file, 'r', encoding='utf-8') as infile:
+ for line in infile:
+
+ if state == State.START:
+ if is_tr(line):
+ row = ""
+ state = State.HEADER
+
+ elif state == State.HEADER:
+ if is_tr(line):
+ # header
+ m = re.findall(r'<td[^>]*>(.*?)</td>', row, re.IGNORECASE)
+ students.add_header(m)
+
+ row = ""
+ state = State.ROW
+ else:
+ # remove spaces at the end of the line, including newline
+ row += line.rstrip()
+
+ else: #State.ROW
+ if is_end_of_row(line):
+ m = re.findall(r'<td[^>]*>(.*?)</td>', row, re.IGNORECASE)
+ students.add(m)
+ logging.debug(m)
+
+ row = ""
+ else:
+ row += line.rstrip()
+
+ except FileNotFoundError as e:
+ print(e)
+ return False
+ return True
+
parser = argparse.ArgumentParser(description='Clean up downloaded roster from HuskyCT. April 2023.')
parser.add_argument('infiles', nargs='+', default=[], help='Input files. Can have multiple files.')
parser.add_argument('-o', nargs='?', default='', const='', help='Output file.')
@@ -173,7 +224,7 @@ students = Students()
for file in args.infiles:
logging.info(f"Loading from {file}...")
- if not load_file(file, students):
+ if not load_html_file(file, students):
exit(1)
if args.listfields:
diff --git a/readme.MD b/readme.MD
index fda41e3..8482d33 100644
--- a/readme.MD
+++ b/readme.MD
@@ -1,9 +1,8 @@
## A Python script that cleans up rosters downloaded from HuskyCT
-The downloaded file is actually an HTML file although Excel can
-extract tables from it. The format may have changed recently. Now
-a student's info is on multiple rows. This script reads a downloaded
-file and writes selected columns into a CSV file.
+This script reads downloaded roster files and writes selected columns/fields
+into a CSV file. If the output filename is not specified, the lines are
+printed to the standard output, which can be redirected into a file.
### Usage
@@ -40,15 +39,28 @@ python cleanup-roster.py downloaded.xls --fields all
In Powershell, the following commands process multiple xls files.
```
-# appends all student rows to all.csv
-cat *.xls | py cleanup-roster.py -o all.csv
+# write all students from multiple files to a single output file, all.csv
+py cleanup-roster.py $(dir *.xls) -o all.csv
# save student records in separate csv files
-dir *.xls | foreach { py cleanup-roster.py $_ -o ($_.name -replace '.xls','.csv') }
+dir *.xls | foreach-object { py cleanup-roster.py $_ -o ($_.name -replace '\.xls$','.csv') }
```
## Notes
+### Dec 2023
+
+The downloaded files are actually HTML files, and the information is in an
+HTML table. Excel can open them, after showing a warning. The file format has
+changed several times. Currently, the "Program and Plan" field is in a single
+table cell. For example,
+
+ <td>Engineering -
+ ^MComputer Science</td>
+
+The script now reads the downloaded HTML file directly, instead of reading
+XLSX files.
+
### Nov 2022
The format of downloaded files keeps changing, mainly in "Program and Plan" field.