Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
cleanup-roster/cleanup-roster.py
Go to file
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
263 lines (214 sloc)
7.21 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3 | |
import sys, argparse, fileinput, logging | |
import csv, re | |
from html.parser import HTMLParser | |
from enum import Enum, auto | |
def is_hot_major(m, checksecond=False):
    """Translate a major description into a short code for the tracked majors.

    Returns "CSE", "CS", "CompE" or "EE" when *m* starts with the matching
    full major name, and "" otherwise.  When *checksecond* is true, *m* must
    also end with "Second" (some records are suffixed " as Second" and some
    " Second"); any other value yields "".
    """
    if checksecond and not m.endswith("Second"):
        return ""

    # Order matters: the longer "Computer Science & Engineering" prefix has
    # to be tried before the plain "Computer Science" one.
    prefix_codes = (
        ("Computer Science & Engineering", "CSE"),
        ("Computer Science", "CS"),
        ("Computer Engineering", "CompE"),
        ("Electrical Engineering", "EE"),
    )
    for prefix, code in prefix_codes:
        if m.startswith(prefix):
            return code

    # Not a tracked major; report other "Computer..." names so that an
    # unexpected spelling does not go unnoticed.
    if m.startswith("Computer"):
        logging.warning(f"Program '{m}' starts with Computer.")
    return ""
def get_major_from_program(prog):
    """Pick the major to report from a roster "program" string.

    The text following the first " -" in *prog* is split on "/" into a list
    of majors.  The result is, in order of preference:

    1. the short code of the first major, if ``is_hot_major`` recognizes it;
    2. the short code of the first later major that is recognized and marked
       as a second major (ends with "Second");
    3. the first major, verbatim.

    Raises:
        ValueError: if *prog* contains no " -" followed by at least one
            character.
    """
    found = re.search(r" -\s*(.+)", prog)
    if found is None:
        # An explicit exception (rather than ``assert``) keeps this check
        # active under ``python -O`` and names the offending value.
        raise ValueError(
            f"Cannot extract a major: no ' -' separator in program {prog!r}")

    m = found.group(1).split("/")
    major = is_hot_major(m[0])
    if major == "":
        # The primary major is not tracked; look for a tracked second major.
        for candidate in m[1:]:
            major = is_hot_major(candidate, True)
            if major:
                break
    if major == "":
        # Nothing tracked at all: report the primary major as written.
        major = m[0]
    logging.debug(f"{m} ==> {major}")
    return major
class MyHTMLParser(HTMLParser):
    """Collect the text of ``<th>``/``<td>`` cells, grouped into rows.

    After ``feed()``, ``table`` holds one list of cell strings per non-empty
    row.  A row is closed when the next ``<tr>`` starts or when ``</table>``
    is seen.
    """

    def __init__(self):
        super().__init__()
        self.table = []    # completed rows
        self.row = []      # cells of the row currently being read
        self.th = False    # inside a <th> element
        self.td = False    # inside a <td> element
        # Text of the current cell.  Initialised here so that an end tag
        # seen before any start tag cannot raise AttributeError.
        self.data = ""

    def handle_starttag(self, tag, attrs):
        if tag == "tr":
            # A new row begins: flush the previous one, if any.
            self.end_of_row()
        elif tag == "th":
            self.th = True
            self.data = ""
        elif tag == "td":
            self.td = True
            self.data = ""

    def handle_endtag(self, tag):
        if tag in ("th", "td"):
            self.row.append(self.data)
            self.th = False
            self.td = False
        if tag == "table":
            self.end_of_row()

    def handle_data(self, data):
        if self.th or self.td:
            # The parser delivers a cell's text in several pieces when the
            # cell contains nested tags, so accumulate instead of overwriting
            # (overwriting kept only the last piece).
            self.data += data.replace("\n", " ")

    def end_of_row(self):
        """Append the current row to ``table`` if it has any cells."""
        if self.row:
            self.table.append(self.row)
            self.row = []
class Students:
    """Roster table: a header of short field names plus one row per student.

    Iterating over the object yields, for every stored row, the list of
    values of the currently selected output fields.
    """

    def __init__(self):
        # short field name -> column index within a row
        self.name_to_idx = {}
        # stored rows, in the order they were added
        self.student_list = []
        # short field names in column order
        self.field_list = []
        # fields (and their order) produced when iterating
        self.output_fields = ["section", "id", "netid", "name", "email"]

    def add_header(self, row):
        """Define the field names from a header row.

        Only the first header takes effect; later calls are ignored.  The
        short name of a column is the first word of its header cell, in lower
        case.  An empty or missing (None) header cell gets the short name "".
        """
        if self.name_to_idx:
            return
        self.field_list = []
        for idx, cell in enumerate(row):
            # short name is only the first word
            words = [] if cell is None else str(cell).split(maxsplit=1)
            shortname = words[0].lower() if words else ""
            self.name_to_idx[shortname] = idx
            self.field_list.append(shortname)
        logging.info("Field names are: " + ','.join(self.field_list))

    def add(self, row):
        """Store one student row, warning when its length differs from the header."""
        if len(row) != len(self.field_list):
            # Cells may be non-strings (numbers, None), so convert them
            # before joining; a plain join would raise TypeError here.
            logging.warning("The number of fields does not seem correct: "
                            + ','.join(str(cell) for cell in row))
        self.student_list.append(row)

    def get_field_list(self):
        """Return the list of short field names, in column order."""
        return self.field_list

    def __iter__(self):
        # The object is its own iterator; starting an iteration rewinds it.
        self.idx = 0
        return self

    def __next__(self):
        if self.idx >= len(self.student_list):
            raise StopIteration
        student = self.student_list[self.idx]
        record = [student[self.name_to_idx[f]] for f in self.output_fields]
        self.idx += 1
        return record

    def __str__(self):
        # show all fields
        return '\n'.join(str(s) for s in self.student_list)

    def set_major(self):
        """Append a 'major' field, derived from each row's 'program' field."""
        idx = self.name_to_idx['program']
        self.name_to_idx['major'] = len(self.field_list)
        self.field_list.append('major')
        for s in self.student_list:
            s.append(get_major_from_program(s[idx]))

    def set_fields(self, fsel):
        """Replace the output fields with *fsel*.

        An empty *fsel* leaves the selection unchanged; the single entry
        'all' selects every known field.
        """
        if not fsel:
            return
        # output all fields
        if len(fsel) == 1 and fsel[0] == 'all':
            self.output_fields = list(self.field_list)
            return
        self.output_fields = []
        self.add_fields(fsel)

    def add_fields(self, fields):
        """Append *fields* (matched case-insensitively) to the output fields.

        Raises:
            ValueError: if any name is not a known field.  Nothing is
                appended in that case.
        """
        wanted = [f.lower() for f in fields]
        unknown = [f for f in wanted if f not in self.field_list]
        if unknown:
            # Field names come from the command line, so report them with an
            # exception that survives ``python -O`` instead of an assert.
            raise ValueError(
                "Unknown field(s): " + ','.join(unknown)
                + ". Available fields: " + ','.join(self.field_list))
        self.output_fields.extend(wanted)
def write_csv(file, students, nl=None):
    """Write each record produced by iterating *students* to *file* as CSV.

    *nl* overrides the line terminator; when it is None the ``csv`` module's
    default terminator is used.
    """
    writer_options = {} if nl is None else {"lineterminator": nl}
    csv.writer(file, **writer_options).writerows(students)
# load from real xlsx files. | |
def load_file_xlsx(file, students):
    """Load the first worksheet of the workbook *file* into *students*.

    The first worksheet row is passed to ``students.add_header`` and every
    later row to ``students.add``, as lists of cell values.  Returns True on
    success; on any exception raised while loading, the exception is printed
    and False is returned.
    """
    import openpyxl
    try:
        workbook = openpyxl.load_workbook(file, data_only=True)
        sheet = workbook.worksheets[0]
        for rownum, cells in enumerate(sheet.rows):
            values = [cell.value for cell in cells]
            if rownum == 0:
                students.add_header(values)
            else:
                students.add(values)
    except Exception as err:
        print(err)
        return False
    return True
### Helper functions | |
def is_tr(s):
    """Return True when *s* begins with the text ``<tr``."""
    row_open = '<tr'
    return s.startswith(row_open)
def is_end_of_row(s):
    """Return True when *s* begins with ``<tr`` or with ``</table>``."""
    row_terminators = ('<tr', '</table>')
    return s.startswith(row_terminators)
# load from HTML file | |
def load_html_file(file, students):
    """Parse the HTML table(s) in *file* and load the rows into *students*.

    The file is read as UTF-8.  The first parsed row is passed to
    ``students.add_header`` and every later row to ``students.add``.

    Returns:
        True on success.  False, after printing the error, when the file
        cannot be opened or read (missing file, permission problem,
        directory given as path, ...) or is not valid UTF-8.
    """
    try:
        # A separate name for the handle avoids shadowing the path argument.
        with open(file, 'r', encoding='utf-8') as infile:
            html = infile.read()
    except (OSError, UnicodeDecodeError) as e:
        # OSError covers FileNotFoundError as well as the other ways that
        # opening or reading a file can fail.
        print(e)
        return False

    parser = MyHTMLParser()
    parser.feed(html)
    for rownum, row in enumerate(parser.table):
        if rownum == 0:
            students.add_header(row)
        else:
            students.add(row)
    return True
# Command-line entry point: load the roster file(s), derive the major, and
# write the selected fields as CSV.
parser = argparse.ArgumentParser(description='Clean up downloaded roster from HuskyCT. April 2023.')
parser.add_argument('infiles', nargs='+', default=[], help='Input files. Can have multiple files.')
parser.add_argument('-o', nargs='?', default='', const='', help='Output file.')
parser.add_argument('-f', nargs='+', default=[], help='Add the list of fields to the default list')
parser.add_argument('--fields', nargs='+', default=[], help='Set the list of fields to print.')
parser.add_argument('--listfields', action='store_true', help='List fields.')
parser.add_argument("-v", action='count', default=0, help='Verbose level.')

args = parser.parse_args()

# -v enables INFO messages, -vv (or more) enables DEBUG messages.
if args.v == 1:
    logging.basicConfig(level=logging.INFO)
elif args.v > 1:
    logging.basicConfig(level=logging.DEBUG)

logging.debug(args)

students = Students()

for file in args.infiles:
    logging.info(f"Loading from {file}...")
    if not load_html_file(file, students):
        # sys.exit() rather than exit(): the latter is added by the site
        # module and is not guaranteed to exist.
        sys.exit(1)

if args.listfields:
    print(students.get_field_list())
    sys.exit(0)

students.set_major()
# --fields replaces the output field selection; -f then appends to it.
students.set_fields(args.fields)
students.add_fields(args.f)

if args.o != '':
    # newline='' lets the csv module control the line endings.
    with open(args.o, 'w', newline='') as csvfile:
        write_csv(csvfile, students)
else:
    write_csv(sys.stdout, students, '\n')