init commit

zhs04001 · Sep 8, 2021 · 497211a · 497211a
commit 497211a
Show file tree

Hide file tree

Showing 2 changed files with 130 additions and 0 deletions.
diff --git a/cleanup-roster.py b/cleanup-roster.py
@@ -0,0 +1,98 @@
+# import system modules
+import argparse
+import csv
+import html
+import re
+import sys
+
+class Students:
+    """One roster record, built cell by cell from the HTML table.
+
+    Every cell passed to ``add_field`` is stored, in order, in ``fields``.
+    A handful of columns are additionally exposed as named attributes
+    (see ``_COLUMN_ATTRS`` for the column-index -> attribute mapping).
+    """
+
+    # Zero-based column index -> attribute that mirrors that column.
+    _COLUMN_ATTRS = {
+        1: "id",
+        2: "name",
+        5: "program",
+        8: "netid",
+        10: "email",
+        13: "section",
+    }
+
+    def __init__(self):
+        self.id = ""
+        self.name = "NameNotSet"
+        self.program = ""
+        self.netid = ""
+        self.email = ""
+        self.section = ""
+        self.fields = []
+
+    def add_field(self, s):
+        """Clean the raw cell text ``s`` and append it as the next column.
+
+        Line-break tags are removed and HTML character references are
+        decoded. If the resulting column index is one of the named
+        columns, the matching attribute is updated as well.
+        """
+        # Strip <br>, <br/>, <br /> (any case). This is done before
+        # decoding so that an escaped "&lt;br/&gt;" stays as literal text.
+        s = re.sub(r"<br\s*/?>", "", s, flags=re.IGNORECASE)
+        # Decode every character reference (&amp;, &#39;, &lt;, ...),
+        # not only &amp;, so names such as O'Brien come out intact.
+        s = html.unescape(s)
+        self.fields.append(s)
+        col = len(self.fields) - 1
+        attr = self._COLUMN_ATTRS.get(col)
+        if attr is not None:
+            setattr(self, attr, s)
+
+    def get_fields(self, fdno):
+        """Return the values selected by ``fdno``.
+
+        ``fdno`` is a sequence of field selectors:
+
+        * empty (or None): the short default tuple
+          ``(section, id, netid, name, email)``;
+        * exactly ``['all']``: the full list of stored fields;
+        * otherwise: each item is converted with ``int`` and used as a
+          zero-based index into the stored fields.
+
+        Raises ``ValueError`` for a selector that is not an integer and
+        ``IndexError`` for an index beyond the stored fields.
+        """
+        if not fdno:  # only a short list
+            return (self.section, self.id, self.netid, self.name, self.email)
+        if len(fdno) == 1 and fdno[0] == 'all':
+            return self.fields
+        return [self.fields[int(n)] for n in fdno]
+
+def write_csv(file, student_list, fields):
+    """Write one CSV row per student to the open text stream ``file``.
+
+    Each row is whatever ``student.get_fields(fields)`` returns for that
+    student; rows are produced lazily, in list order.
+    """
+    rows = (student.get_fields(fields) for student in student_list)
+    csv.writer(file).writerows(rows)
+
+# A complete table cell held on one (possibly re-joined) line; group 1 is
+# the cell's inner text.
+_CELL_RE = re.compile(r'^<td[^>]*>(.*)</td>')
+
+
+def _parse_roster(lines):
+    """Build the list of Students from the lines of a downloaded roster.
+
+    ``lines`` is any iterable of text lines (an open file works). A line
+    starting with ``<tr>`` or ``</table>`` closes the student currently
+    being collected. A cell may span several physical lines: text is
+    buffered from a line starting with ``<td`` until a line ending with
+    ``</td>`` is seen, then handed to ``Students.add_field``.
+    """
+    students = []
+    student = None
+    pending = ''
+
+    for line in lines:
+        # A new row (or the end of the table) finishes the current student.
+        if line.startswith(('<tr>', '</table>')):
+            if student is not None:
+                students.append(student)
+                student = None
+
+        # remove spaces at the end of the line, including newline
+        line = line.rstrip()
+
+        if line.startswith('<td'):
+            pending = ''
+        else:
+            # Continuation of a cell that started on an earlier line.
+            line = pending + line
+
+        if not line.endswith('</td>'):
+            pending = line
+            continue
+
+        # The cell is complete; clear the buffer so its text cannot leak
+        # into whatever follows.
+        pending = ''
+
+        # now we have a line that starts with "<td" and ends with "</td>"
+        m = _CELL_RE.search(line)
+        if m:
+            if student is None:
+                student = Students()
+            student.add_field(m.group(1))
+
+    # Keep the last student even when the input ends without a trailing
+    # "<tr>" or "</table>" line.
+    if student is not None:
+        students.append(student)
+    return students
+
+
+parser = argparse.ArgumentParser(description='Clean up downloaded roster from HuskyCT')
+parser.add_argument('infile', nargs='?', help='Input file (default: standard input).')
+parser.add_argument('-o', nargs='?', default='', const='', help='Output file.')
+parser.add_argument('-f', nargs='+', default='', help='List of field numbers.')
+
+args = parser.parse_args()
+# print(args)
+
+# infile is optional on the command line; read standard input when it is
+# omitted instead of failing on open(None).
+if args.infile is None:
+    student_list = _parse_roster(sys.stdin)
+else:
+    with open(args.infile, 'r') as file:
+        student_list = _parse_roster(file)
+
+# An empty -o (the default, or "-o" given without a value) means stdout.
+if args.o:
+    with open(args.o, 'w', newline='') as csvfile:
+        write_csv(csvfile, student_list, args.f)
+else:
+    write_csv(sys.stdout, student_list, args.f)
diff --git a/readme.MD b/readme.MD
@@ -0,0 +1,32 @@
+## A Python script that cleans up rosters downloaded from HuskyCT 
+
+The downloaded file is actually an HTML file although Excel can 
+extract tables from it. The format may have changed recently. Now
+a student's info is on multiple rows. This script reads a downloaded
+file and writes selected columns into a CSV file.
+
+###  Usage
+
+Assume all files are under the current directory.  
+
+The following command prints info on the screen. 
+
+```
+python cleanup-roster.py downloaded.xls 
+```
+
+The `-o` option specifies an output file. 
+
+```
+python cleanup-roster.py downloaded.xls -o section1.csv
+```
+
+The `-f` option specifies the fields to be included, given as zero-based
+column numbers of the roster table. `all` means all fields. Without the
+`-f` option, only the most useful fields (section, ID, NetID, name, and
+email) are printed.
+
+```
+python cleanup-roster.py downloaded.xls -f 1 2
+python cleanup-roster.py downloaded.xls -f all
+```
+