From feb0e56c292c50bdfc59bbe3db07f66fcc05b837 Mon Sep 17 00:00:00 2001
From: Jerry Shi <jerryshict@outlook.com>
Date: Mon, 3 Apr 2023 15:04:25 -0400
Subject: [PATCH] update script to read new format

---
 cleanup-roster-old.py | 187 ++++++++++++++++++++++++++++++++++++++++++
 cleanup-roster.py     |  84 +++++++++----------
 2 files changed, 226 insertions(+), 45 deletions(-)
 create mode 100644 cleanup-roster-old.py
diff --git a/cleanup-roster-old.py b/cleanup-roster-old.py
new file mode 100644
index 0000000..0623a40
--- /dev/null
+++ b/cleanup-roster-old.py
@@ -0,0 +1,187 @@
+#!/usr/bin/python3
+import sys, argparse, fileinput, logging
+import csv, re
+from enum import Enum, auto
+
+def     is_hot_major(m, checksecond = False):  # short code for a tracked major name, "" otherwise
+    if checksecond:
+        # some records are appended " as Second" and some " Second"
+        if not m.endswith("Second"):
+            return ""
+    # The longer "Computer Science & Engineering" prefix must be tested before "Computer Science".
+    if m.startswith("Computer Science & Engineering"):
+        major = "CSE"
+    elif m.startswith("Computer Science"):
+        major = "CS"
+    elif m.startswith("Computer Engineering"):
+        major = "CompE"
+    elif m.startswith("Electrical Engineering"):
+        major = "EE"
+    else:
+        if m.startswith("Computer"):  # unrecognized "Computer..." name: logged, still not tracked
+            logging.warning(f"Program '{m}' starts with Computer.")
+        major = ""
+    return major
+
+def     get_major_from_program(prog):  # pick one major label out of a program string
+    m = re.findall(r"(?: -|/)([^/]+)", prog)
+    # m holds each run of non-"/" text that follows " -" or "/"
+    assert len(m) > 0
+    # the first piece is matched as-is; later pieces count only if they end in "Second"
+    major = is_hot_major(m[0])
+
+    if major == "":
+        if len(m) > 1:
+            for p in m[1:]:
+                major = is_hot_major(p, True)
+                if major:
+                    break
+    if major == "":  # nothing tracked: fall back to the first piece unchanged
+        major = m[0]
+    logging.debug(f"{m} ==> {major}")
+    return major
+
+class State(Enum):  # state of the line-reading loop at the bottom of this script
+    START = auto()  # looking for a line that starts a '<tr' row
+    ROW = auto()  # accumulating the lines of the current row
+
+class   Students:  # roster table: parsed rows plus the column selection used for output
+
+    def     __init__(self):
+        self.name_to_idx = {}  # column short name -> column index
+        self.student_list = []  # one list of cell strings per student row
+        self.field_list = []  # column short names, in table order
+        self.output_fields = ["section", "id", "netid", "name", "email"]  # default columns emitted by iteration
+
+    def     add_header(self, line):  # record column names from a <th> row; only the first header is kept
+        if len(self.name_to_idx):
+            return
+        m = re.findall(r'<th[^>]*>(.*?)</th>', line, re.IGNORECASE)
+        idx = 0
+        self.field_list = []
+        for f in m:
+            shortname = f.split(maxsplit=1)[0].lower()  # first word of the header text, lowercased
+            self.name_to_idx[shortname] = idx 
+            self.field_list.append(shortname)
+            idx += 1
+        logging.info("Field names are: " + ','.join(self.field_list))
+
+    def     add(self, line):  # store one table row; a row without <td> cells is treated as the header
+        # replace &amp; 
+        logging.debug(line)
+        line1 = re.sub(r"&amp;", "&", line)
+        # remove br tag (disabled: the substitution below is commented out)
+        # s = re.sub(r"<br\s*/>", "", s)
+        m = re.findall(r'<td[^>]*>(.*?)</td>', line1, re.IGNORECASE)
+        if m:
+            if len(m) != len(self.field_list): 
+                logging.warning("The number of fields does not seem correct: "
+                                + ','.join(m)) 
+            self.student_list.append(m)  # a mismatched row is still stored
+        else:
+            self.add_header(line1)
+
+    def     __iter__(self):  # iteration always restarts from the first student
+        self.idx = 0
+        return self
+
+    def     __next__(self):  # next student as the list of values selected by output_fields
+        if self.idx >= len(self.student_list):
+            raise StopIteration
+        # print(self.idx, self.student_list[self.idx])
+        record = [ self.student_list[self.idx][self.name_to_idx[f]] for f in self.output_fields ]
+        self.idx += 1
+        return record
+
+    def     __str__(self):
+        # show all fields
+        return '\n'.join([ str(s) for s in self.student_list])
+
+    def     set_major(self):  # append a 'major' column derived from each row's 'program' column
+        idx = self.name_to_idx['program']  # KeyError if no 'program' column was read
+        self.name_to_idx['major'] = len(self.field_list)
+        self.field_list.append('major')
+        for s in self.student_list:
+            s.append(get_major_from_program(s[idx]))
+
+    def     set_fields(self, fsel):  # replace the output selection; an empty list leaves it unchanged
+        if len(fsel) == 0:
+            return
+
+        # output all fields
+        if len(fsel) == 1 and fsel[0] == 'all':
+            self.output_fields = list(self.field_list)
+            return
+
+        self.output_fields = []
+        self.add_fields(fsel)
+
+    def     add_fields(self, fields):  # append columns (matched case-insensitively) to the output selection
+        for f in [ f.lower() for f in fields]:
+            assert f in self.field_list  # an unknown column name aborts with AssertionError
+            self.output_fields.append(f)
+
+def     write_csv(file, students, nl = None):  # write one CSV row per item yielded by students
+    if nl is None:
+        csvwriter = csv.writer(file)  # keep the csv module's default line terminator
+    else:
+        csvwriter = csv.writer(file, lineterminator=nl)
+    for s in students: 
+        csvwriter.writerow(s)
+
+def     is_tr(s):  # True when the line opens a table row, with or without attributes
+    return s.startswith('<tr')
+
+def     is_end_of_row(s):  # True when a bare '<tr>' or '</table>' line ends the row being collected
+    return s.startswith('<tr>') or s.startswith('</table>')
+
+parser = argparse.ArgumentParser(description='Clean up downloaded roster from HuskyCT. Nov 2022.')
+parser.add_argument('infiles', nargs='*', default=[], help='Input files.')
+parser.add_argument('-o', nargs='?', default='', const='', help='Output file.')
+parser.add_argument('-f', nargs='+', default=[], help='Add the list of fields to the default list')
+parser.add_argument('--fields', nargs='+', default=[], help='Set the list of fields to print.')
+parser.add_argument("-v", action='count', default=0, help='Verbose level.')
+
+args = parser.parse_args()
+
+if args.v == 1:
+    logging.basicConfig(level=logging.INFO)
+elif args.v > 1:
+    logging.basicConfig(level=logging.DEBUG)
+# -v selects INFO, -vv or more selects DEBUG; without -v logging is left unconfigured
+logging.debug(args)
+
+students = Students()
+# A row spans several input lines: collect from a '<tr' line until is_end_of_row() matches.
+state = State.START
+row = ''
+try:
+    for line in fileinput.input(args.infiles):
+        # remove spaces at the end of the line, including newline
+        line = line.rstrip()
+
+        if state == State.ROW:
+            if is_end_of_row(line):  # the row is complete: hand it to Students.add()
+                students.add(row)
+                state = State.START
+            else:
+                row += line
+                continue
+        # (an end-of-row line falls through here: a '<tr>' line also starts the next row)
+        # looking for a row
+        if is_tr(line):
+            row = line
+            state = State.ROW
+
+except FileNotFoundError as e:  # a missing input file exits with status 1 and prints nothing
+    exit(1)
+
+students.set_major()
+students.set_fields(args.fields)
+students.add_fields(args.f)
+
+if args.o != '':
+    with open(args.o, 'w', newline='') as csvfile:  # newline='' as the csv module asks for writer files
+        write_csv(csvfile, students)
+else:
+    write_csv(sys.stdout, students, '\n')  # stdout rows end with '\n'
diff --git a/cleanup-roster.py b/cleanup-roster.py
index 0623a40..c2af6df 100644
--- a/cleanup-roster.py
+++ b/cleanup-roster.py
@@ -1,6 +1,8 @@
 #!/usr/bin/python3
 import sys, argparse, fileinput, logging
 import csv, re
+import openpyxl
+
 from enum import Enum, auto
 
 def     is_hot_major(m, checksecond = False):
@@ -24,9 +26,13 @@ def     is_hot_major(m, checksecond = False):
     return major
 
 def     get_major_from_program(prog):
-    m = re.findall(r"(?: -|/)([^/]+)", prog)
+    # m = re.findall(r"(?: -|/)([^/]+)", prog)
+
+    majors = re.search("_x000D_(.+)", prog)
+    assert  majors is not None
+    # print(majors.group())
 
-    assert len(m) > 0
+    m = majors.group(1).split("/") 
 
     major = is_hot_major(m[0])
 
@@ -53,33 +59,28 @@ def     __init__(self):
         self.field_list = []
         self.output_fields = ["section", "id", "netid", "name", "email"]
 
-    def     add_header(self, line):
+    def     add_header(self, row):
+        # Record column short names from the header row; only the first header read is kept.
         if len(self.name_to_idx):
             return
-        m = re.findall(r'<th[^>]*>(.*?)</th>', line, re.IGNORECASE)
-        idx = 0
         self.field_list = []
-        for f in m:
-            shortname = f.split(maxsplit=1)[0].lower()
+        for idx, cell in enumerate(row):
+            # Short name is the first word, lowercased; an empty or non-text header cell yields "".
+            words = str(cell.value or "").split(maxsplit=1)
+            shortname = words[0].lower() if words else ""
             self.name_to_idx[shortname] = idx 
             self.field_list.append(shortname)
-            idx += 1
         logging.info("Field names are: " + ','.join(self.field_list))
 
-    def     add(self, line):
-        # replace &amp; 
-        logging.debug(line)
-        line1 = re.sub(r"&amp;", "&", line)
-        # remove br tag
-        # s = re.sub(r"<br\s*/>", "", s)
-        m = re.findall(r'<td[^>]*>(.*?)</td>', line1, re.IGNORECASE)
-        if m:
-            if len(m) != len(self.field_list): 
-                logging.warning("The number of fields does not seem correct: "
-                                + ','.join(m)) 
-            self.student_list.append(m)
-        else:
-            self.add_header(line1)
+    def     add(self, row):
+        # Store one worksheet row (an iterable of cells) as a list of the cells' values.
+        # The values are extracted first so that the warning below can print them.
+        m = [ c.value for c in row ]
+        if len(m) != len(self.field_list):
+            # str() each value: cells may hold None or numbers, which str.join() rejects.
+            logging.warning("The number of fields does not seem correct: "
+                            + ','.join(str(v) for v in m))
+        self.student_list.append(m)
 
     def     __iter__(self):
         self.idx = 0
@@ -129,13 +130,25 @@ def     write_csv(file, students, nl = None):
     for s in students: 
         csvwriter.writerow(s)
 
-def     is_tr(s):
-    return s.startswith('<tr')
+def load_file(file, students):  # read the first worksheet of a workbook into students; True on success
+    try:
+        wb = openpyxl.load_workbook(file, data_only=True)
+        worksheet = wb.worksheets[0]
+        # The first row is the header; every later row is one student record.
+        first = True
+        for row in worksheet.rows:
+            if first: 
+                students.add_header(row)
+                first = False
+            else:
+                students.add(row)
 
-def     is_end_of_row(s):
-    return s.startswith('<tr>') or s.startswith('</table>')
+    except Exception as e:
+        logging.error(f"Cannot load {file}: {e}")  # not print(): stdout may be carrying the CSV output
+        return False
+    return True
 
-parser = argparse.ArgumentParser(description='Clean up downloaded roster from HuskyCT. Nov 2022.')
+parser = argparse.ArgumentParser(description='Clean up downloaded roster from HuskyCT. April 2023.')
 parser.add_argument('infiles', nargs='*', default=[], help='Input files.')
 parser.add_argument('-o', nargs='?', default='', const='', help='Output file.')
 parser.add_argument('-f', nargs='+', default=[], help='Add the list of fields to the default list')
@@ -153,28 +166,9 @@ def     is_end_of_row(s):
 
 students = Students()
 
-state = State.START
-row = ''
-try:
-    for line in fileinput.input(args.infiles):
-        # remove spaces at the end of the line, including newline
-        line = line.rstrip()
-
-        if state == State.ROW:
-            if is_end_of_row(line):
-                students.add(row)
-                state = State.START
-            else:
-                row += line
-                continue
-
-        # looking for a row
-        if is_tr(line):
-            row = line
-            state = State.ROW
-
-except FileNotFoundError as e:
-    exit(1)
+for file in args.infiles:
+    logging.info(f"Loading from {file}...")
+    load_file(file, students)
 
 students.set_major()
 students.set_fields(args.fields)