refactor code. add major info

zhs04001 · Nov 16, 2022 · c863cf9 · c863cf9
1 parent 603d406
commit c863cf9
Show file tree

Hide file tree

Showing 2 changed files with 193 additions and 99 deletions.
diff --git a/cleanup-roster.py b/cleanup-roster.py
@@ -1,117 +1,187 @@
-import sys, re, argparse
-import csv
+#!/usr/bin/python3
+import sys, argparse, fileinput, logging
+import csv, re
+from enum import Enum, auto
+
+def     is_hot_major(m, checksecond = False):
+    if checksecond:
+        # some records are appended " as Second" and some " Second"
+        if not m.endswith("Second"):
+            return ""
+
+    if m.startswith("Computer Science & Engineering"):
+        major = "CSE"
+    elif m.startswith("Computer Science"):
+        major = "CS"
+    elif m.startswith("Computer Engineering"):
+        major = "CompE"
+    elif m.startswith("Electrical Engineering"):
+        major = "EE"
+    else:
+        if m.startswith("Computer"):
+            logging.warning(f"Program '{m}' starts with Computer.")
+        major = ""
+    return major
+
+def     get_major_from_program(prog):
+    m = re.findall(r"(?: -|/)([^/]+)", prog)
+
+    assert len(m) > 0
+
+    major = is_hot_major(m[0])
+
+    if major == "":
+        if len(m) > 1:
+            for p in m[1:]:
+                major = is_hot_major(p, True)
+                if major:
+                    break
+    if major == "":
+        major = m[0]
+    logging.debug(f"{m} ==> {major}")
+    return major
 
-class   Student:
+class State(Enum):
+    START = auto()
+    ROW = auto()
+
+class   Students:
 
     def     __init__(self):
-        self.id = ""
-        self.name = "NameNotSet"
-        self.program = ""
-        self.netid = ""
-        self.email = ""
-        self.section = ""
-        self.fields = []
-
-    def     add_field(self, s):
+        self.name_to_idx = {}
+        self.student_list = []
+        self.field_list = []
+        self.output_fields = ["section", "id", "netid", "name", "email"]
+
+    def     add_header(self, line):
+        if len(self.name_to_idx):
+            return
+        m = re.findall(r'<th[^>]*>(.*?)</th>', line, re.IGNORECASE)
+        idx = 0
+        self.field_list = []
+        for f in m:
+            shortname = f.split(maxsplit=1)[0].lower()
+            self.name_to_idx[shortname] = idx 
+            self.field_list.append(shortname)
+            idx += 1
+        logging.info("Field names are: " + ','.join(self.field_list))
+
+    def     add(self, line):
         # replace &amp; 
-        s = re.sub(r"&amp;", "&", s)
+        logging.debug(line)
+        line1 = re.sub(r"&amp;", "&", line)
         # remove br tag
-        s = re.sub(r"<br\s*/>", "", s)
-        self.fields.append(s) 
-        col = len(self.fields) - 1 
-        # print(col, s)
-        if col == 1:
-            self.id = s
-        elif col == 2:
-            self.name = s
-        elif col == 5:
-            self.program = s
-        elif col == 8:
-            self.netid = s
-        elif col == 10:
-            self.email = s
-        elif col == 13:
-            self.section = s
-
-    def     get_fields(self, fdno):
-        # could provide more options to get customized fields
-        if len(fdno) == 0: # only a short list
-            return (self.section, self.id, self.netid, self.name, self.email)
-        elif len(fdno) == 1 and fdno[0] == 'all':
-            return self.fields
+        # s = re.sub(r"<br\s*/>", "", s)
+        m = re.findall(r'<td[^>]*>(.*?)</td>', line1, re.IGNORECASE)
+        if m:
+            if len(m) != len(self.field_list): 
+                logging.warning("The number of fields does not seem correct: "
+                                + ','.join(m)) 
+            self.student_list.append(m)
         else:
-            return [self.fields[int(_)] for _ in fdno]
-
-def     write_csv(file, student_list, fields, nl = None): 
+            self.add_header(line1)
+
+    def     __iter__(self):
+        self.idx = 0
+        return self
+
+    def     __next__(self):
+        if self.idx >= len(self.student_list):
+            raise StopIteration
+        # print(self.idx, self.student_list[self.idx])
+        record = [ self.student_list[self.idx][self.name_to_idx[f]] for f in self.output_fields ]
+        self.idx += 1
+        return record
+
+    def     __str__(self):
+        # show all fields
+        return '\n'.join([ str(s) for s in self.student_list])
+
+    def     set_major(self):
+        idx = self.name_to_idx['program'] 
+        self.name_to_idx['major'] = len(self.field_list)
+        self.field_list.append('major')
+        for s in self.student_list:
+            s.append(get_major_from_program(s[idx]))
+
+    def     set_fields(self, fsel):
+        if len(fsel) == 0:
+            return
+
+        # output all fields
+        if len(fsel) == 1 and fsel[0] == 'all':
+            self.output_fields = list(self.field_list)
+            return
+
+        self.output_fields = []
+        self.add_fields(fsel)
+
+    def     add_fields(self, fields):
+        for f in [ f.lower() for f in fields]:
+            assert f in self.field_list
+            self.output_fields.append(f)
+
+def     write_csv(file, students, nl = None): 
     if nl is None:
         csvwriter = csv.writer(file)
     else:
         csvwriter = csv.writer(file, lineterminator=nl)
-    for s in student_list: 
-        csvwriter.writerow(s.get_fields(fields))
+    for s in students: 
+        csvwriter.writerow(s)
 
-parser = argparse.ArgumentParser(description='Clean up downloaded roster from HuskyCT')
-parser.add_argument('infile', help='Input file.')
-parser.add_argument('-o', nargs='?', default='', const='', help='Output file.')
-parser.add_argument('-f', nargs='+', default='', help='List of field numbers.')
-parser.add_argument("-a", action='store_true', default=False, help='Append to the output file.')
-parser.add_argument("-v", action='store_true', default=False, help='Verbose.')
+def     is_tr(s):
+    return s.startswith('<tr')
 
-args = parser.parse_args()
-if args.v:
-    print(args)
+def     is_end_of_row(s):
+    return s.startswith('<tr>') or s.startswith('</table>')
 
-student_list = []
-student = None
-previous = ''
+parser = argparse.ArgumentParser(description='Clean up downloaded roster from HuskyCT. Nov 2022.')
+parser.add_argument('infiles', nargs='*', default=[], help='Input files.')
+parser.add_argument('-o', nargs='?', default='', const='', help='Output file.')
+parser.add_argument('-f', nargs='+', default=[], help='Add the list of fields to the default list')
+parser.add_argument('--fields', nargs='+', default=[], help='Set the list of fields to print.')
+parser.add_argument("-v", action='count', default=0, help='Verbose level.')
 
-try:
-    with open(args.infile, 'r') as file:
-        num_lines = 0
-        for line in file:
-            num_lines += 1
-
-            if line.startswith('<tr>') or line.startswith('</table>'):
-                if student is not None:
-                    student_list.append(student)
-                    student = None
-                previous = ''
-                continue
+args = parser.parse_args()
 
-            # remove spaces at the end of the line, including newline
-            line = line.rstrip()
+if args.v == 1:
+    logging.basicConfig(level=logging.INFO)
+elif args.v > 1:
+    logging.basicConfig(level=logging.DEBUG)
+
+logging.debug(args)
 
-            # append to previous line, if there is any
-            if previous: 
-                line = previous + line
+students = Students()
 
-            if not line.startswith("<td"):
-                # ignore lines that do not indicate a new field
-                assert(previous == '')
+state = State.START
+row = ''
+try:
+    for line in fileinput.input(args.infiles):
+        # remove spaces at the end of the line, including newline
+        line = line.rstrip()
+
+        if state == State.ROW:
+            if is_end_of_row(line):
+                students.add(row)
+                state = State.START
+            else:
+                row += line
                 continue
 
-            # print(line)
-
-            # now we check if the line starts with "<td" and has a "</td>"
-            m = re.match(r'<td[^>]*>(.*)</td>', line)
-            if m:
-                if student is None:
-                    student = Student()
-                student.add_field(m.group(1))
-                previous = ''
-            else:
-                # if the line does not have the ending tag, more lines are needed
-                previous = line
+        # looking for a row
+        if is_tr(line):
+            row = line
+            state = State.ROW
 
 except FileNotFoundError as e:
-    print(e)
     exit(1)
 
+students.set_major()
+students.set_fields(args.fields)
+students.add_fields(args.f)
+
 if args.o != '':
-    flag = 'w'
-    if args.a:
-        flag = 'a+'
-    with open(args.o, flag, newline='') as csvfile:
-        write_csv(csvfile, student_list, args.f)
+    with open(args.o, 'w', newline='') as csvfile:
+        write_csv(csvfile, students)
 else:
-    write_csv(sys.stdout, student_list, args.f, '\n')
+    write_csv(sys.stdout, students, '\n')
diff --git a/readme.MD b/readme.MD
@@ -21,23 +21,47 @@ The `-o` option specifies an output file.
 python cleanup-roster.py downloaded.xls -o section1.csv
 ```
 
-The `-a` option specifies the append mode so the output file is not cleared.
+Multiple `xls` files can be specified.
 
-The `-f` option specifies the fields to be included. `all` 
-means all fields. Without `-f` option, only a few most useful
-fields are printed.
+By default, the script does not output all fields. The `-f` option specifies one
+or more additional fields to be included. For example, the following command adds
+enrollment date and program fields to the output.
+
+    python cleanup-roster.py downloaded.xls -f enrollment program
+
+The `--fields` option specifies the full list of fields to be printed. `all`
+means all fields. 
 
 ```
-python cleanup-roster.py downloaded.xls -f 1 2
-python cleanup-roster.py downloaded.xls -f all
+python cleanup-roster.py downloaded.xls --fields name netid
+python cleanup-roster.py downloaded.xls --fields all
 ```
 
 In Powershell, the following commands process multiple xls files.
 
 ```
 # appends all student rows to all.csv
-dir *.xls | foreach { py cleanup-roster.py $_ -a -o all.csv }
+cat *.xls | py cleanup-roster.py -o all.csv
 
 # save student records in separate csv files
 dir *.xls | foreach { py cleanup-roster.py $_ -o ($_.name -replace '.xls','.csv') }
 ```
+
+## Notes
+
+### Nov 2022
+
+The format of downloaded files keeps changing, mainly in the "Program and Plan" field.
+The format in Nov 2022 is:
+
+    <school> - <\r><major>
+    <school> - <\r><major>/<minor>
+    <school> - <\r><major>/<minor1>/<minor2>
+    <school> - <\r><major>/<major2> as Second
+    <school> - <\r><major>/<major2> Second
+
+The school can be:
+
+    Engineering
+    Liberal Arts & Sciences
+    CCS Non-Degree