Skip to content
Permalink
6c0c854444
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
187 lines (154 sloc) 5.35 KB
#!/usr/bin/python3
import sys, argparse, fileinput, logging
import csv, re
from enum import Enum, auto
def is_hot_major(m, checksecond=False):
    """Translate a major description into the short code of a tracked major.

    Args:
        m: Text describing a major.
        checksecond: When true, ``m`` is only considered if it ends with
            "Second" (some records are suffixed " as Second", others
            " Second"); anything else yields "".

    Returns:
        "CSE", "CS", "CompE" or "EE" when ``m`` starts with the matching
        major name, otherwise the empty string.
    """
    if checksecond and not m.endswith("Second"):
        return ""

    # Order matters: "Computer Science & Engineering" has to be tried before
    # its own prefix "Computer Science".
    prefix_to_code = (
        ("Computer Science & Engineering", "CSE"),
        ("Computer Science", "CS"),
        ("Computer Engineering", "CompE"),
        ("Electrical Engineering", "EE"),
    )
    for prefix, code in prefix_to_code:
        if m.startswith(prefix):
            return code

    # Flag unrecognised "Computer..." programs so they can be noticed.
    if m.startswith("Computer"):
        logging.warning(f"Program '{m}' starts with Computer.")
    return ""
def get_major_from_program(prog):
    """Pick the major to report for a student's program string.

    The program string is split into the pieces that follow " -" or "/"
    (each piece runs up to the next "/").  The first piece is the primary
    major; later pieces are only accepted when ``is_hot_major`` recognises
    them as a tracked second major.

    Args:
        prog: Program description containing at least one " -" or "/"
            separator followed by text.

    Returns:
        The short code of the first tracked major found, or the first piece
        unchanged when none of the pieces is a tracked major.

    Raises:
        AssertionError: If no piece can be extracted from ``prog``.
    """
    m = re.findall(r"(?: -|/)([^/]+)", prog)
    assert len(m) > 0

    primary, later_pieces = m[0], m[1:]
    major = is_hot_major(primary)
    if not major:
        for candidate in later_pieces:
            major = is_hot_major(candidate, True)
            if major:
                break
        else:
            # No tracked major anywhere: report the primary text as-is.
            major = primary

    logging.debug(f"{m} ==> {major}")
    return major
class State(Enum):
    """States of the line scanner that collects table rows."""

    # Not inside a row; waiting for a line that opens one.
    START = auto()
    # Inside a row; following lines are appended to it.
    ROW = auto()
class Students:
    """Roster built from HTML table rows, iterable as lists of selected fields.

    Rows are fed in one at a time through ``add``.  The first row without
    ``<td>`` cells supplies the column names; every row with ``<td>`` cells
    becomes one student record.  Iterating the object yields, for each
    student, the values of ``output_fields`` in order.

    Iteration state lives on the instance (``self.idx``), so two overlapping
    iterations over the same object interfere with each other.
    """

    # Cell extractors, compiled once instead of on every row.
    _HEADER_CELL_RE = re.compile(r'<th[^>]*>(.*?)</th>', re.IGNORECASE)
    _DATA_CELL_RE = re.compile(r'<td[^>]*>(.*?)</td>', re.IGNORECASE)

    def __init__(self):
        # Column short name -> position of that column in a record.
        self.name_to_idx = {}
        # One list of cell strings per student.
        self.student_list = []
        # Column short names in table order.
        self.field_list = []
        # Columns emitted when iterating.
        self.output_fields = ["section", "id", "netid", "name", "email"]
        # Cursor used by __iter__/__next__.
        self.idx = 0

    def add_header(self, line):
        """Record column names from the ``<th>`` cells of ``line``.

        Each column is named by the lower-cased first word of its header
        text.  A header cell that is empty or only whitespace is named ""
        so that later columns keep their correct positions.  Only the first
        header that yields at least one column is used; later calls are
        ignored.
        """
        if self.name_to_idx:
            return
        self.field_list = []
        for idx, cell in enumerate(self._HEADER_CELL_RE.findall(line)):
            words = cell.split(maxsplit=1)
            # An empty header cell has no first word; indexing would raise.
            shortname = words[0].lower() if words else ""
            self.name_to_idx[shortname] = idx
            self.field_list.append(shortname)
        logging.info("Field names are: " + ','.join(self.field_list))

    def add(self, line):
        """Add one table row given as a single string.

        ``&amp;`` is decoded to ``&`` first.  A row with ``<td>`` cells is
        stored as a student record (a warning is logged when its cell count
        differs from the number of known columns, but it is still stored).
        A row without ``<td>`` cells is treated as a header candidate.
        """
        logging.debug(line)
        line1 = line.replace("&amp;", "&")
        cells = self._DATA_CELL_RE.findall(line1)
        if not cells:
            self.add_header(line1)
            return
        if len(cells) != len(self.field_list):
            logging.warning("The number of fields does not seem correct: "
                            + ','.join(cells))
        self.student_list.append(cells)

    def __iter__(self):
        # Restart from the first student on every new iteration.
        self.idx = 0
        return self

    def __next__(self):
        """Return the next student as a list of the ``output_fields`` values.

        Raises:
            StopIteration: When all students have been returned.
            KeyError: If an output field is not a known column.
            IndexError: If the record is too short to hold an output field.
        """
        if self.idx >= len(self.student_list):
            raise StopIteration
        student = self.student_list[self.idx]
        record = [student[self.name_to_idx[f]] for f in self.output_fields]
        self.idx += 1
        return record

    def __str__(self):
        # Show all fields of every student, one student per line.
        return '\n'.join(str(s) for s in self.student_list)

    def set_major(self):
        """Append a derived 'major' column computed from the 'program' column.

        Raises:
            KeyError: If no 'program' column is known.
        """
        idx = self.name_to_idx['program']
        self.name_to_idx['major'] = len(self.field_list)
        self.field_list.append('major')
        for s in self.student_list:
            s.append(get_major_from_program(s[idx]))

    def set_fields(self, fsel):
        """Replace the output columns with ``fsel``.

        An empty ``fsel`` leaves the current selection untouched; the single
        entry 'all' selects every known column.
        """
        if len(fsel) == 0:
            return
        if len(fsel) == 1 and fsel[0] == 'all':
            # Copy, so later add_fields() calls do not alter field_list.
            self.output_fields = list(self.field_list)
            return
        self.output_fields = []
        self.add_fields(fsel)

    def add_fields(self, fields):
        """Append ``fields`` (matched case-insensitively) to the output columns.

        Raises:
            AssertionError: If a field is not a known column.
        """
        for f in [f.lower() for f in fields]:
            assert f in self.field_list, f"Unknown field: {f}"
            self.output_fields.append(f)
def write_csv(file, students, nl=None):
    """Write every record of ``students`` to ``file`` as CSV.

    Args:
        file: Writable text stream.
        students: Iterable of rows, each an iterable of field values.
        nl: Line terminator to use; when ``None`` the csv module's default
            terminator is kept.
    """
    writer_options = {} if nl is None else {"lineterminator": nl}
    csv.writer(file, **writer_options).writerows(students)
def is_tr(s):
    """Return True when ``s`` begins with ``<tr`` (with or without attributes)."""
    row_open_prefix = '<tr'
    return s.startswith(row_open_prefix)
def is_end_of_row(s):
    """Return True when ``s`` begins with ``<tr>`` or ``</table>``.

    Either marker means the row being collected is complete: a new row is
    opening or the table is closing.
    """
    return s.startswith(('<tr>', '</table>'))
# Command-line interface.
parser = argparse.ArgumentParser(description='Clean up downloaded roster from HuskyCT. Nov 2022.')
parser.add_argument('infiles', nargs='*', default=[], help='Input files.')
parser.add_argument('-o', nargs='?', default='', const='', help='Output file.')
parser.add_argument('-f', nargs='+', default=[], help='Add the list of fields to the default list')
parser.add_argument('--fields', nargs='+', default=[], help='Set the list of fields to print.')
parser.add_argument("-v", action='count', default=0, help='Verbose level.')

args = parser.parse_args()

# -v enables INFO messages, -vv (or more) enables DEBUG messages.
if args.v == 1:
    logging.basicConfig(level=logging.INFO)
elif args.v > 1:
    logging.basicConfig(level=logging.DEBUG)

logging.debug(args)

students = Students()

# Scan the input line by line, gluing the lines of each table row into one
# string before handing it to Students.add().
state = State.START
row = ''
try:
    # The context manager closes the input stream even if parsing fails.
    with fileinput.input(args.infiles) as lines:
        for line in lines:
            # remove spaces at the end of the line, including newline
            line = line.rstrip()

            if state == State.ROW:
                if not is_end_of_row(line):
                    row += line
                    continue
                # The current row is complete.  Fall through instead of
                # skipping the line: a '<tr>' that ends one row also
                # starts the next one.
                students.add(row)
                state = State.START

            # looking for a row
            if is_tr(line):
                row = line
                state = State.ROW

    # Input ended without a closing marker: keep the row collected so far
    # rather than dropping it silently.
    if state == State.ROW:
        students.add(row)
except FileNotFoundError as e:
    # Say which input could not be opened instead of exiting silently.
    print(f"{parser.prog}: {e}", file=sys.stderr)
    sys.exit(1)

students.set_major()
students.set_fields(args.fields)
students.add_fields(args.f)

if args.o != '':
    # newline='' lets the csv module control line endings in the file.
    with open(args.o, 'w', newline='') as csvfile:
        write_csv(csvfile, students)
else:
    write_csv(sys.stdout, students, '\n')