diff --git a/cleanup-roster.py b/cleanup-roster.py
index f85b5e5..0623a40 100644
--- a/cleanup-roster.py
+++ b/cleanup-roster.py
@@ -1,117 +1,187 @@
-import sys, re, argparse
-import csv
+#!/usr/bin/python3
+import sys, argparse, fileinput, logging
+import csv, re
+from enum import Enum, auto
+
+def is_hot_major(m, checksecond = False):
+ if checksecond:
+ # some records are appended " as Second" and some " Second"
+ if not m.endswith("Second"):
+ return ""
+
+ if m.startswith("Computer Science & Engineering"):
+ major = "CSE"
+ elif m.startswith("Computer Science"):
+ major = "CS"
+ elif m.startswith("Computer Engineering"):
+ major = "CompE"
+ elif m.startswith("Electrical Engineering"):
+ major = "EE"
+ else:
+ if m.startswith("Computer"):
+ logging.warning(f"Program '{m}' starts with Computer.")
+ major = ""
+ return major
+
+def get_major_from_program(prog):
+ m = re.findall(r"(?: -|/)([^/]+)", prog)
+
+ assert len(m) > 0
+
+ major = is_hot_major(m[0])
+
+ if major == "":
+ if len(m) > 1:
+ for p in m[1:]:
+ major = is_hot_major(p, True)
+ if major:
+ break
+ if major == "":
+ major = m[0]
+ logging.debug(f"{m} ==> {major}")
+ return major
-class Student:
+class State(Enum):
+ START = auto()
+ ROW = auto()
+
+class Students:
def __init__(self):
- self.id = ""
- self.name = "NameNotSet"
- self.program = ""
- self.netid = ""
- self.email = ""
- self.section = ""
- self.fields = []
-
- def add_field(self, s):
+ self.name_to_idx = {}
+ self.student_list = []
+ self.field_list = []
+ self.output_fields = ["section", "id", "netid", "name", "email"]
+
+ def add_header(self, line):
+ if len(self.name_to_idx):
+ return
+ m = re.findall(r'
]*>(.*?) | ', line, re.IGNORECASE)
+ idx = 0
+ self.field_list = []
+ for f in m:
+ shortname = f.split(maxsplit=1)[0].lower()
+ self.name_to_idx[shortname] = idx
+ self.field_list.append(shortname)
+ idx += 1
+ logging.info("Field names are: " + ','.join(self.field_list))
+
+ def add(self, line):
# replace &
- s = re.sub(r"&", "&", s)
+ logging.debug(line)
+ line1 = re.sub(r"&", "&", line)
# remove br tag
- s = re.sub(r"
", "", s)
- self.fields.append(s)
- col = len(self.fields) - 1
- # print(col, s)
- if col == 1:
- self.id = s
- elif col == 2:
- self.name = s
- elif col == 5:
- self.program = s
- elif col == 8:
- self.netid = s
- elif col == 10:
- self.email = s
- elif col == 13:
- self.section = s
-
- def get_fields(self, fdno):
- # could provide more options to get customized fields
- if len(fdno) == 0: # only a short list
- return (self.section, self.id, self.netid, self.name, self.email)
- elif len(fdno) == 1 and fdno[0] == 'all':
- return self.fields
+ # s = re.sub(r"
", "", s)
+ m = re.findall(r']*>(.*?) | ', line1, re.IGNORECASE)
+ if m:
+ if len(m) != len(self.field_list):
+ logging.warning("The number of fields does not seem correct: "
+ + ','.join(m))
+ self.student_list.append(m)
else:
- return [self.fields[int(_)] for _ in fdno]
-
-def write_csv(file, student_list, fields, nl = None):
+ self.add_header(line1)
+
+ def __iter__(self):
+ self.idx = 0
+ return self
+
+ def __next__(self):
+ if self.idx >= len(self.student_list):
+ raise StopIteration
+ # print(self.idx, self.student_list[self.idx])
+ record = [ self.student_list[self.idx][self.name_to_idx[f]] for f in self.output_fields ]
+ self.idx += 1
+ return record
+
+ def __str__(self):
+ # show all fields
+ return '\n'.join([ str(s) for s in self.student_list])
+
+ def set_major(self):
+ idx = self.name_to_idx['program']
+ self.name_to_idx['major'] = len(self.field_list)
+ self.field_list.append('major')
+ for s in self.student_list:
+ s.append(get_major_from_program(s[idx]))
+
+ def set_fields(self, fsel):
+ if len(fsel) == 0:
+ return
+
+ # output all fields
+ if len(fsel) == 1 and fsel[0] == 'all':
+ self.output_fields = list(self.field_list)
+ return
+
+ self.output_fields = []
+ self.add_fields(fsel)
+
+ def add_fields(self, fields):
+ for f in [ f.lower() for f in fields]:
+ assert f in self.field_list
+ self.output_fields.append(f)
+
+def write_csv(file, students, nl = None):
if nl is None:
csvwriter = csv.writer(file)
else:
csvwriter = csv.writer(file, lineterminator=nl)
- for s in student_list:
- csvwriter.writerow(s.get_fields(fields))
+ for s in students:
+ csvwriter.writerow(s)
-parser = argparse.ArgumentParser(description='Clean up downloaded roster from HuskyCT')
-parser.add_argument('infile', help='Input file.')
-parser.add_argument('-o', nargs='?', default='', const='', help='Output file.')
-parser.add_argument('-f', nargs='+', default='', help='List of field numbers.')
-parser.add_argument("-a", action='store_true', default=False, help='Append to the output file.')
-parser.add_argument("-v", action='store_true', default=False, help='Verbose.')
+def is_tr(s):
+ return s.startswith('') or s.startswith('')
-student_list = []
-student = None
-previous = ''
+parser = argparse.ArgumentParser(description='Clean up downloaded roster from HuskyCT. Nov 2022.')
+parser.add_argument('infiles', nargs='*', default=[], help='Input files.')
+parser.add_argument('-o', nargs='?', default='', const='', help='Output file.')
+parser.add_argument('-f', nargs='+', default=[], help='Add the list of fields to the default list')
+parser.add_argument('--fields', nargs='+', default=[], help='Set the list of fields to print.')
+parser.add_argument("-v", action='count', default=0, help='Verbose level.')
-try:
- with open(args.infile, 'r') as file:
- num_lines = 0
- for line in file:
- num_lines += 1
-
- if line.startswith('
') or line.startswith(''):
- if student is not None:
- student_list.append(student)
- student = None
- previous = ''
- continue
+args = parser.parse_args()
- # remove spaces at the end of the line, including newline
- line = line.rstrip()
+if args.v == 1:
+ logging.basicConfig(level=logging.INFO)
+elif args.v > 1:
+ logging.basicConfig(level=logging.DEBUG)
+
+logging.debug(args)
- # append to previous line, if there is any
- if previous:
- line = previous + line
+students = Students()
- if not line.startswith(""
- m = re.match(r' | ]*>(.*) | ', line)
- if m:
- if student is None:
- student = Student()
- student.add_field(m.group(1))
- previous = ''
- else:
- # if the line does not have the ending tag, more lines are needed
- previous = line
+ # looking for a row
+ if is_tr(line):
+ row = line
+ state = State.ROW
except FileNotFoundError as e:
- print(e)
exit(1)
+students.set_major()
+students.set_fields(args.fields)
+students.add_fields(args.f)
+
if args.o != '':
- flag = 'w'
- if args.a:
- flag = 'a+'
- with open(args.o, flag, newline='') as csvfile:
- write_csv(csvfile, student_list, args.f)
+ with open(args.o, 'w', newline='') as csvfile:
+ write_csv(csvfile, students)
else:
- write_csv(sys.stdout, student_list, args.f, '\n')
+ write_csv(sys.stdout, students, '\n')
diff --git a/readme.MD b/readme.MD
index bf3a041..fda41e3 100644
--- a/readme.MD
+++ b/readme.MD
@@ -21,23 +21,47 @@ The `-o` option specifies an output file.
python cleanup-roster.py downloaded.xls -o section1.csv
```
-The `-a` option specifies the append mode so the output file is not cleared.
+Multiple `xls` files can be specified.
-The `-f` option specifies the fields to be included. `all`
-means all fields. Without `-f` option, only a few most useful
-fields are printed.
+The script does not output all fields. The `-f` option specifies the additional
+fields, one or more, to be included. For example, the following command adds
+enrollment date and program fields to the output.
+
+ python cleanup-roster.py downloaded.xls -f enrollment program
+
+The `--fields` option specifies the full list of fields to be printed. `all`
+means all fields.
```
-python cleanup-roster.py downloaded.xls -f 1 2
-python cleanup-roster.py downloaded.xls -f all
+python cleanup-roster.py downloaded.xls --fields name netid
+python cleanup-roster.py downloaded.xls --fields all
```
In Powershell, the following commands process multiple xls files.
```
# appends all student rows to all.csv
-dir *.xls | foreach { py cleanup-roster.py $_ -a -o all.csv }
+cat *.xls | py cleanup-roster.py -o all.csv
# save student records in separate csv files
dir *.xls | foreach { py cleanup-roster.py $_ -o ($_.name -replace '.xls','.csv') }
```
+
+## Notes
+
+### Nov 2022
+
+The format of downloaded files keeps changing, mainly in the "Program and Plan" field.
+The format in Nov 2022 is:
+
+ - <\r>
+ - <\r>/
+ - <\r>//
+ - <\r>/ as Second
+ - <\r>/ Second
+
+The school can be:
+
+ Engineering
+ Liberal Arts & Sciences
+ CCS Non-Degree