diff --git a/cleanup-roster.py b/cleanup-roster.py index f85b5e5..0623a40 100644 --- a/cleanup-roster.py +++ b/cleanup-roster.py @@ -1,117 +1,187 @@ -import sys, re, argparse -import csv +#!/usr/bin/python3 +import sys, argparse, fileinput, logging +import csv, re +from enum import Enum, auto + +def is_hot_major(m, checksecond = False): + if checksecond: + # some records are appended " as Second" and some " Second" + if not m.endswith("Second"): + return "" + + if m.startswith("Computer Science & Engineering"): + major = "CSE" + elif m.startswith("Computer Science"): + major = "CS" + elif m.startswith("Computer Engineering"): + major = "CompE" + elif m.startswith("Electrical Engineering"): + major = "EE" + else: + if m.startswith("Computer"): + logging.warning(f"Program '{m}' starts with Computer.") + major = "" + return major + +def get_major_from_program(prog): + m = re.findall(r"(?: -|/)([^/]+)", prog) + + assert len(m) > 0 + + major = is_hot_major(m[0]) + + if major == "": + if len(m) > 1: + for p in m[1:]: + major = is_hot_major(p, True) + if major: + break + if major == "": + major = m[0] + logging.debug(f"{m} ==> {major}") + return major -class Student: +class State(Enum): + START = auto() + ROW = auto() + +class Students: def __init__(self): - self.id = "" - self.name = "NameNotSet" - self.program = "" - self.netid = "" - self.email = "" - self.section = "" - self.fields = [] - - def add_field(self, s): + self.name_to_idx = {} + self.student_list = [] + self.field_list = [] + self.output_fields = ["section", "id", "netid", "name", "email"] + + def add_header(self, line): + if len(self.name_to_idx): + return + m = re.findall(r']*>(.*?)', line, re.IGNORECASE) + idx = 0 + self.field_list = [] + for f in m: + shortname = f.split(maxsplit=1)[0].lower() + self.name_to_idx[shortname] = idx + self.field_list.append(shortname) + idx += 1 + logging.info("Field names are: " + ','.join(self.field_list)) + + def add(self, line): # replace & - s = re.sub(r"&", "&", s) + 
logging.debug(line) + line1 = re.sub(r"&", "&", line) # remove br tag - s = re.sub(r"", "", s) - self.fields.append(s) - col = len(self.fields) - 1 - # print(col, s) - if col == 1: - self.id = s - elif col == 2: - self.name = s - elif col == 5: - self.program = s - elif col == 8: - self.netid = s - elif col == 10: - self.email = s - elif col == 13: - self.section = s - - def get_fields(self, fdno): - # could provide more options to get customized fields - if len(fdno) == 0: # only a short list - return (self.section, self.id, self.netid, self.name, self.email) - elif len(fdno) == 1 and fdno[0] == 'all': - return self.fields + # s = re.sub(r"", "", s) + m = re.findall(r']*>(.*?)', line1, re.IGNORECASE) + if m: + if len(m) != len(self.field_list): + logging.warning("The number of fields does not seem correct: " + + ','.join(m)) + self.student_list.append(m) else: - return [self.fields[int(_)] for _ in fdno] - -def write_csv(file, student_list, fields, nl = None): + self.add_header(line1) + + def __iter__(self): + self.idx = 0 + return self + + def __next__(self): + if self.idx >= len(self.student_list): + raise StopIteration + # print(self.idx, self.student_list[self.idx]) + record = [ self.student_list[self.idx][self.name_to_idx[f]] for f in self.output_fields ] + self.idx += 1 + return record + + def __str__(self): + # show all fields + return '\n'.join([ str(s) for s in self.student_list]) + + def set_major(self): + idx = self.name_to_idx['program'] + self.name_to_idx['major'] = len(self.field_list) + self.field_list.append('major') + for s in self.student_list: + s.append(get_major_from_program(s[idx])) + + def set_fields(self, fsel): + if len(fsel) == 0: + return + + # output all fields + if len(fsel) == 1 and fsel[0] == 'all': + self.output_fields = list(self.field_list) + return + + self.output_fields = [] + self.add_fields(fsel) + + def add_fields(self, fields): + for f in [ f.lower() for f in fields]: + assert f in self.field_list + 
self.output_fields.append(f) + +def write_csv(file, students, nl = None): if nl is None: csvwriter = csv.writer(file) else: csvwriter = csv.writer(file, lineterminator=nl) - for s in student_list: - csvwriter.writerow(s.get_fields(fields)) + for s in students: + csvwriter.writerow(s) -parser = argparse.ArgumentParser(description='Clean up downloaded roster from HuskyCT') -parser.add_argument('infile', help='Input file.') -parser.add_argument('-o', nargs='?', default='', const='', help='Output file.') -parser.add_argument('-f', nargs='+', default='', help='List of field numbers.') -parser.add_argument("-a", action='store_true', default=False, help='Append to the output file.') -parser.add_argument("-v", action='store_true', default=False, help='Verbose.') +def is_tr(s): + return s.startswith('') or s.startswith('') -student_list = [] -student = None -previous = '' +parser = argparse.ArgumentParser(description='Clean up downloaded roster from HuskyCT. Nov 2022.') +parser.add_argument('infiles', nargs='*', default=[], help='Input files.') +parser.add_argument('-o', nargs='?', default='', const='', help='Output file.') +parser.add_argument('-f', nargs='+', default=[], help='Add the list of fields to the default list') +parser.add_argument('--fields', nargs='+', default=[], help='Set the list of fields to print.') +parser.add_argument("-v", action='count', default=0, help='Verbose level.') -try: - with open(args.infile, 'r') as file: - num_lines = 0 - for line in file: - num_lines += 1 - - if line.startswith('') or line.startswith(''): - if student is not None: - student_list.append(student) - student = None - previous = '' - continue +args = parser.parse_args() - # remove spaces at the end of the line, including newline - line = line.rstrip() +if args.v == 1: + logging.basicConfig(level=logging.INFO) +elif args.v > 1: + logging.basicConfig(level=logging.DEBUG) + +logging.debug(args) - # append to previous line, if there is any - if previous: - line = previous + line 
+students = Students() - if not line.startswith("" - m = re.match(r']*>(.*)', line) - if m: - if student is None: - student = Student() - student.add_field(m.group(1)) - previous = '' - else: - # if the line does not have the ending tag, more lines are needed - previous = line + # looking for a row + if is_tr(line): + row = line + state = State.ROW except FileNotFoundError as e: - print(e) exit(1) +students.set_major() +students.set_fields(args.fields) +students.add_fields(args.f) + if args.o != '': - flag = 'w' - if args.a: - flag = 'a+' - with open(args.o, flag, newline='') as csvfile: - write_csv(csvfile, student_list, args.f) + with open(args.o, 'w', newline='') as csvfile: + write_csv(csvfile, students) else: - write_csv(sys.stdout, student_list, args.f, '\n') + write_csv(sys.stdout, students, '\n') diff --git a/readme.MD b/readme.MD index bf3a041..fda41e3 100644 --- a/readme.MD +++ b/readme.MD @@ -21,23 +21,47 @@ The `-o` option specifies an output file. python cleanup-roster.py downloaded.xls -o section1.csv ``` -The `-a` option specifies the append mode so the output file is not cleared. +Multiple `xls` files can be specified. -The `-f` option specifies the fields to be included. `all` -means all fields. Without `-f` option, only a few most useful -fields are printed. +The script does not output all fields. The `-f` option specifies the additional +fields, one or more, to be included. For example, the following command adds +enrollment date and program fields to the output. + + python cleanup-roster.py downloaded.xls -f enrollment program + +The `--fields` option specifies a full list of fields to be printed. `all` +means all fields. ``` -python cleanup-roster.py downloaded.xls -f 1 2 -python cleanup-roster.py downloaded.xls -f all +python cleanup-roster.py downloaded.xls --fields name netid +python cleanup-roster.py downloaded.xls --fields all ``` In PowerShell, the following commands process multiple xls files. 
``` # appends all student rows to all.csv -dir *.xls | foreach { py cleanup-roster.py $_ -a -o all.csv } +cat *.xls | py cleanup-roster.py -o all.csv # save student records in separate csv files dir *.xls | foreach { py cleanup-roster.py $_ -o ($_.name -replace '.xls','.csv') } ``` + +## Notes + +### Nov 2022 + +The format of downloaded files keeps changing, mainly in the "Program and Plan" field. +The format in Nov 2022 is: + + - <\r> + - <\r>/ + - <\r>// + - <\r>/ as Second + - <\r>/ Second + +The school can be: + + Engineering + Liberal Arts & Sciences + CCS Non-Degree