From feb0e56c292c50bdfc59bbe3db07f66fcc05b837 Mon Sep 17 00:00:00 2001 From: Jerry Shi Date: Mon, 3 Apr 2023 15:04:25 -0400 Subject: [PATCH] update script to read new format --- cleanup-roster-old.py | 187 ++++++++++++++++++++++++++++++++++++++++++ cleanup-roster.py | 84 +++++++++---------- 2 files changed, 226 insertions(+), 45 deletions(-) create mode 100644 cleanup-roster-old.py diff --git a/cleanup-roster-old.py b/cleanup-roster-old.py new file mode 100644 index 0000000..0623a40 --- /dev/null +++ b/cleanup-roster-old.py @@ -0,0 +1,187 @@ +#!/usr/bin/python3 +import sys, argparse, fileinput, logging +import csv, re +from enum import Enum, auto + +def is_hot_major(m, checksecond = False): + if checksecond: + # some records are appended " as Second" and some " Second" + if not m.endswith("Second"): + return "" + + if m.startswith("Computer Science & Engineering"): + major = "CSE" + elif m.startswith("Computer Science"): + major = "CS" + elif m.startswith("Computer Engineering"): + major = "CompE" + elif m.startswith("Electrical Engineering"): + major = "EE" + else: + if m.startswith("Computer"): + logging.warning(f"Program '{m}' starts with Computer.") + major = "" + return major + +def get_major_from_program(prog): + m = re.findall(r"(?: -|/)([^/]+)", prog) + + assert len(m) > 0 + + major = is_hot_major(m[0]) + + if major == "": + if len(m) > 1: + for p in m[1:]: + major = is_hot_major(p, True) + if major: + break + if major == "": + major = m[0] + logging.debug(f"{m} ==> {major}") + return major + +class State(Enum): + START = auto() + ROW = auto() + +class Students: + + def __init__(self): + self.name_to_idx = {} + self.student_list = [] + self.field_list = [] + self.output_fields = ["section", "id", "netid", "name", "email"] + + def add_header(self, line): + if len(self.name_to_idx): + return + m = re.findall(r']*>(.*?)', line, re.IGNORECASE) + idx = 0 + self.field_list = [] + for f in m: + shortname = f.split(maxsplit=1)[0].lower() + self.name_to_idx[shortname] = idx + self.field_list.append(shortname) + idx += 1 + logging.info("Field names are: " + ','.join(self.field_list)) + + def add(self, line): + # replace & + logging.debug(line) + line1 = re.sub(r"&", "&", line) + # remove br tag + # s = re.sub(r"", "", s) + m = re.findall(r']*>(.*?)', line1, re.IGNORECASE) + if m: + if len(m) != len(self.field_list): + logging.warning("The number of fields does not seem correct: " + + ','.join(m)) + self.student_list.append(m) + else: + self.add_header(line1) + + def __iter__(self): + self.idx = 0 + return self + + def __next__(self): + if self.idx >= len(self.student_list): + raise StopIteration + # print(self.idx, self.student_list[self.idx]) + record = [ self.student_list[self.idx][self.name_to_idx[f]] for f in self.output_fields ] + self.idx += 1 + return record + + def __str__(self): + # show all fields + return '\n'.join([ str(s) for s in self.student_list]) + + def set_major(self): + idx = self.name_to_idx['program'] + self.name_to_idx['major'] = len(self.field_list) + self.field_list.append('major') + for s in self.student_list: + s.append(get_major_from_program(s[idx])) + + def set_fields(self, fsel): + if len(fsel) == 0: + return + + # output all fields + if len(fsel) == 1 and fsel[0] == 'all': + self.output_fields = list(self.field_list) + return + + self.output_fields = [] + self.add_fields(fsel) + + def add_fields(self, fields): + for f in [ f.lower() for f in fields]: + assert f in self.field_list + self.output_fields.append(f) + +def write_csv(file, students, nl = None): + if nl is None: + csvwriter = csv.writer(file) + else: + csvwriter = csv.writer(file, lineterminator=nl) + for s in students: + csvwriter.writerow(s) + +def is_tr(s): + return s.startswith('') or s.startswith('') + +parser = argparse.ArgumentParser(description='Clean up downloaded roster from HuskyCT. Nov 2022.') +parser.add_argument('infiles', nargs='*', default=[], help='Input files.') +parser.add_argument('-o', nargs='?', default='', const='', help='Output file.') +parser.add_argument('-f', nargs='+', default=[], help='Add the list of fields to the default list') +parser.add_argument('--fields', nargs='+', default=[], help='Set the list of fields to print.') +parser.add_argument("-v", action='count', default=0, help='Verbose level.') + +args = parser.parse_args() + +if args.v == 1: + logging.basicConfig(level=logging.INFO) +elif args.v > 1: + logging.basicConfig(level=logging.DEBUG) + +logging.debug(args) + +students = Students() + +state = State.START +row = '' +try: + for line in fileinput.input(args.infiles): + # remove spaces at the end of the line, including newline + line = line.rstrip() + + if state == State.ROW: + if is_end_of_row(line): + students.add(row) + state = State.START + else: + row += line + continue + + # looking for a row + if is_tr(line): + row = line + state = State.ROW + +except FileNotFoundError as e: + exit(1) + +students.set_major() +students.set_fields(args.fields) +students.add_fields(args.f) + +if args.o != '': + with open(args.o, 'w', newline='') as csvfile: + write_csv(csvfile, students) +else: + write_csv(sys.stdout, students, '\n') diff --git a/cleanup-roster.py b/cleanup-roster.py index 0623a40..c2af6df 100644 --- a/cleanup-roster.py +++ b/cleanup-roster.py @@ -1,6 +1,8 @@ #!/usr/bin/python3 import sys, argparse, fileinput, logging import csv, re +import openpyxl + from enum import Enum, auto def is_hot_major(m, checksecond = False): @@ -24,9 +26,13 @@ def is_hot_major(m, checksecond = False): return major def get_major_from_program(prog): - m = re.findall(r"(?: -|/)([^/]+)", prog) + # m = re.findall(r"(?: -|/)([^/]+)", prog) + + majors = re.search("_x000D_(.+)", prog) + assert majors is not None + # print(majors.group()) - assert len(m) > 0 + m = majors.group(1).split("/") major = is_hot_major(m[0]) @@ -53,33 +59,28 @@ def __init__(self): self.field_list = [] self.output_fields = ["section", "id", "netid", "name", "email"] - def add_header(self, line): + def add_header(self, row): if len(self.name_to_idx): return - m = re.findall(r']*>(.*?)', line, re.IGNORECASE) idx = 0 self.field_list = [] - for f in m: - shortname = f.split(maxsplit=1)[0].lower() + for cell in row: + # short name is only the first word + shortname = cell.value.split(maxsplit=1)[0].lower() self.name_to_idx[shortname] = idx self.field_list.append(shortname) idx += 1 logging.info("Field names are: " + ','.join(self.field_list)) - def add(self, line): + def add(self, row): # replace & - logging.debug(line) - line1 = re.sub(r"&", "&", line) - # remove br tag - # s = re.sub(r"", "", s) - m = re.findall(r']*>(.*?)', line1, re.IGNORECASE) - if m: - if len(m) != len(self.field_list): - logging.warning("The number of fields does not seem correct: " + # line1 = re.sub(r"&", "&", line) + + if len(row) != len(self.field_list): + logging.warning("The number of fields does not seem correct: " + ','.join(m)) - self.student_list.append(m) - else: - self.add_header(line1) + m = [ c.value for c in row ] + self.student_list.append(m) def __iter__(self): self.idx = 0 @@ -129,13 +130,25 @@ def write_csv(file, students, nl = None): for s in students: csvwriter.writerow(s) -def is_tr(s): - return s.startswith('') or s.startswith('') + except Exception as e: + print(e) + return False + return True -parser = argparse.ArgumentParser(description='Clean up downloaded roster from HuskyCT. Nov 2022.') +parser = argparse.ArgumentParser(description='Clean up downloaded roster from HuskyCT. April 2023.') parser.add_argument('infiles', nargs='*', default=[], help='Input files.') parser.add_argument('-o', nargs='?', default='', const='', help='Output file.') parser.add_argument('-f', nargs='+', default=[], help='Add the list of fields to the default list') @@ -153,28 +166,9 @@ def is_end_of_row(s): students = Students() -state = State.START -row = '' -try: - for line in fileinput.input(args.infiles): - # remove spaces at the end of the line, including newline - line = line.rstrip() - - if state == State.ROW: - if is_end_of_row(line): - students.add(row) - state = State.START - else: - row += line - continue - - # looking for a row - if is_tr(line): - row = line - state = State.ROW - -except FileNotFoundError as e: - exit(1) +for file in args.infiles: + logging.info(f"Loading from {file}...") + load_file(file, students) students.set_major() students.set_fields(args.fields)