From fa06a68270558b690b23691316ccfdd611b4f881 Mon Sep 17 00:00:00 2001
From: Jerry Shi <zhijieshi@gmail.com>
Date: Sat, 23 Dec 2023 18:24:27 -0500
Subject: [PATCH] support html table in cleanup-roster.py

---
 cleanup-roster.py | 71 ++++++++++++++++++++++++++++++++++++++++-------
 readme.MD         | 26 ++++++++++++-----
 2 files changed, 80 insertions(+), 17 deletions(-)
diff --git a/cleanup-roster.py b/cleanup-roster.py
index 7607d81..73a8e04 100644
--- a/cleanup-roster.py
+++ b/cleanup-roster.py
@@ -1,7 +1,6 @@
 #!/usr/bin/python3
 import sys, argparse, fileinput, logging
 import csv, re
-import openpyxl
 
 from enum import Enum, auto
 
@@ -28,8 +27,9 @@ def     is_hot_major(m, checksecond = False):
 def     get_major_from_program(prog):
     # m = re.findall(r"(?: -|/)([^/]+)", prog)
     # majors = re.search("_x000D_(.+)", prog)
-    majors = re.search(r" -\s+(.+)", prog)
+    prog = re.sub(r"&amp;", "&", prog)
     # print(prog)
+    majors = re.search(r" -\s*(.+)", prog)
     assert  majors is not None
     # print(majors.group())
 
@@ -50,6 +50,7 @@ def     get_major_from_program(prog):
 
 class State(Enum):
     START = auto()
+    HEADER = auto()     # between the first and second '<tr' lines of the HTML file
     ROW = auto()
 
 class   Students:
@@ -67,7 +68,7 @@ class   Students:
         self.field_list = []
         for cell in row:
             # short name is only the first word
-            shortname = cell.value.split(maxsplit=1)[0].lower()
+            shortname = cell.split(maxsplit=1)[0].lower()
             self.name_to_idx[shortname] = idx 
             self.field_list.append(shortname)
             idx += 1
@@ -79,9 +80,9 @@ class   Students:
 
         if len(row) != len(self.field_list): 
             logging.warning("The number of fields does not seem correct: "
-                                + ','.join(m)) 
-        m = [ c.value for c in row ]
-        self.student_list.append(m)
+                                + ','.join(row)) 
+        self.student_list.append(row)
+        # print(len(self.student_list), row)
 
     def     get_field_list(self):
         return self.field_list
@@ -134,24 +135,74 @@ def     write_csv(file, students, nl = None):
     for s in students: 
         csvwriter.writerow(s)
 
-def load_file(file, students):
+# Load a roster from a real .xlsx workbook (first worksheet only).
+def load_file_xlsx(file, students):
+    import openpyxl     # local import: only needed when this loader is called
     try:
         wb = openpyxl.load_workbook(file, data_only=True)
         worksheet = wb.worksheets[0]
 
         first = True
         for row in worksheet.rows:
+            m = [ c.value for c in row ]    # pass plain values, not cell objects
             if first: 
-                students.add_header(row)
+                students.add_header(m)
                 first = False
             else:
-                students.add(row)
+                students.add(m)
 
     except Exception as e:
         print(e)
         return False
     return True
 
+### Helper functions
+def     is_tr(s):
+    return s[:3] == '<tr'   # True when the line opens a table row
+
+def     is_end_of_row(s):
+    return s.startswith(('<tr', '</table>'))    # next row or end of table
+
+# load from HTML file
+def load_html_file(file, students):
+    """Load the roster table from an HTML file into `students`.
+
+    The lines between the first two lines starting with '<tr' form the header
+    (<th> cells); after that, each line starting with '<tr' or '</table>'
+    ends a data row (<td> cells).  Returns False if the file cannot be read.
+    """
+    state = State.START
+    row = ""
+    try:
+        with open(file, 'r', encoding='utf-8') as infile:
+            for line in infile:
+                if state == State.START:
+                    if is_tr(line):
+                        state = State.HEADER
+                elif state == State.HEADER:
+                    if is_tr(line):
+                        m = re.findall(r'<th[^>]*>(.*?)</th>', row, re.IGNORECASE)
+                        students.add_header(m)
+                        row = ""
+                        state = State.ROW
+                    else:
+                        # remove spaces at the end of the line, including newline
+                        row += line.rstrip()
+                else:   #State.ROW
+                    if is_end_of_row(line):
+                        m = re.findall(r'<td[^>]*>(.*?)</td>', row, re.IGNORECASE)
+                        if m:   # text without any <td> cell is not a student
+                            students.add(m)
+                            logging.debug(m)
+                        row = ""
+                    else:
+                        row += line.rstrip()
+    except (OSError, UnicodeDecodeError) as e:
+        # unreadable or non-UTF-8 input is reported like a missing file
+        print(e)
+        return False
+    return True
+
 parser = argparse.ArgumentParser(description='Clean up downloaded roster from HuskyCT. April 2023.')
 parser.add_argument('infiles', nargs='+', default=[], help='Input files. Can have multiple files.')
 parser.add_argument('-o', nargs='?', default='', const='', help='Output file.')
@@ -173,7 +224,7 @@ students = Students()
 
 for file in args.infiles:
     logging.info(f"Loading from {file}...")
-    if not load_file(file, students):
+    if not load_html_file(file, students):
         exit(1)
 
 if args.listfields:
diff --git a/readme.MD b/readme.MD
index fda41e3..8482d33 100644
--- a/readme.MD
+++ b/readme.MD
@@ -1,9 +1,8 @@
 ## A Python script that cleans up rosters downloaded from HuskyCT 
 
-The downloaded file is actually an HTML file although Excel can 
-extract tables from it. The format may have changed recently. Now
-a student's info is on multiple rows. This script reads a downloaded
-file and writes selected columns into a CSV file.
+This script reads downloaded roster files and writes selected columns/fields
+into a CSV file. If the output filename is not specified, the lines are
+printed to the standard output, which can be redirected into a file.
 
 ###  Usage
 
@@ -40,15 +39,28 @@ python cleanup-roster.py downloaded.xls --fields all
 In Powershell, the following commands process multiple xls files.
 
 ```
-# appends all student rows to all.csv
-cat *.xls | py cleanup-roster.py -o all.csv
+# write all students from multiple files to a single output file, all.csv
+py cleanup-roster.py $(dir *.xls) -o all.csv
 
 # save student records in separate csv files
-dir *.xls | foreach { py cleanup-roster.py $_ -o ($_.name -replace '.xls','.csv') }
+dir *.xls | foreach-object { py cleanup-roster.py $_ -o ($_.name -replace '\.xls$','.csv') }
 ```
 
 ## Notes
 
+### Dec 2023
+
+The downloaded files are actually HTML files, and the information is in an
+HTML table. Excel can open them, after a warning. The format of the files has
+changed several times. Currently, the "Program and Plan" field is in a single
+table cell. For example:
+
+    <td  >Engineering -
+    ^MComputer Science</td>
+
+The script now reads the downloaded file directly, instead of reading an
+XLSX file.
+
 ### Nov 2022
 
 The format of downloaded files keeps changing, mainly in "Program and Plan" field.