Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
#!/usr/bin/python3
import sys, argparse, fileinput, logging
import csv, re
from html.parser import HTMLParser
from enum import Enum, auto
def is_hot_major(m, checksecond=False):
    """Translate a major description into the short code of a tracked major.

    Args:
        m: Major description text, e.g. "Computer Science & Engineering".
        checksecond: When true, ``m`` is only considered if it ends with
            "Second" (records mark a second major either as " as Second"
            or as " Second"); otherwise "" is returned right away.

    Returns:
        "CSE", "CS", "CompE" or "EE" when ``m`` starts with the matching
        major name, and "" for everything else.
    """
    if checksecond and not m.endswith("Second"):
        return ""

    # Order matters: the longer "Computer Science & Engineering" prefix has
    # to be tried before the plain "Computer Science" prefix.
    known_prefixes = (
        ("Computer Science & Engineering", "CSE"),
        ("Computer Science", "CS"),
        ("Computer Engineering", "CompE"),
        ("Electrical Engineering", "EE"),
    )
    for prefix, code in known_prefixes:
        if m.startswith(prefix):
            return code

    # Not a tracked major; report "Computer..." programs we do not recognize.
    if m.startswith("Computer"):
        logging.warning(f"Program '{m}' starts with Computer.")
    return ""
def get_major_from_program(prog):
    """Extract the major to report from a program description.

    The text following the first " -" in ``prog`` is split on "/".  The
    first piece is the primary major; any further pieces are only accepted
    when ``is_hot_major`` recognizes them as a second major.

    Args:
        prog: Program description containing " -" followed by the major(s).

    Returns:
        The short code of the first tracked major found (primary first,
        then second majors in order); if none is tracked, the primary
        major text unchanged.

    Raises:
        AssertionError: If ``prog`` contains no " -" followed by text.
    """
    found = re.search(r" -\s*(.+)", prog)
    assert found is not None

    parts = found.group(1).split("/")
    primary = parts[0]

    major = is_hot_major(primary)
    if major == "":
        # Lazily probe the remaining pieces and stop at the first hit.
        second_codes = (is_hot_major(piece, True) for piece in parts[1:])
        major = next((code for code in second_codes if code), "")
    if major == "":
        # Nothing tracked at all: report the primary major as written.
        major = primary

    logging.debug(f"{parts} ==> {major}")
    return major
class MyHTMLParser(HTMLParser):
    """Collect the text of HTML table cells row by row.

    After ``feed()``, ``table`` holds one list per table row; each list
    contains the text of that row's ``<th>``/``<td>`` cells in document
    order.  A row is closed when the next ``<tr>`` starts or when
    ``</table>`` is seen.
    """

    def __init__(self):
        super().__init__()
        self.table = []   # completed rows
        self.row = []     # cells of the row currently being read
        self.th = False   # inside a <th> cell
        self.td = False   # inside a <td> cell
        # Text of the current cell.  Initialized here so that a stray
        # closing cell tag cannot hit a missing attribute.
        self.data = ""

    def handle_starttag(self, tag, attrs):
        """Start a new row on ``<tr>`` and a new, empty cell on ``<th>``/``<td>``."""
        if tag == "tr":
            self.end_of_row()
        elif tag == "th":
            self.th = True
            self.data = ""
        elif tag == "td":
            self.td = True
            self.data = ""

    def handle_endtag(self, tag):
        """Store the finished cell; flush the pending row on ``</table>``."""
        if tag in ("th", "td"):
            self.row.append(self.data)
            self.th = False
            self.td = False
        if tag == "table":
            self.end_of_row()

    def handle_data(self, data):
        """Append text found inside a cell, turning newlines into spaces."""
        if self.th or self.td:
            # The parser may deliver one cell's text in several chunks
            # (e.g. around nested tags), so accumulate instead of
            # overwriting the text collected so far.
            self.data += data.replace("\n", " ")

    def end_of_row(self):
        """Move the current row into ``table`` if it contains any cell."""
        if self.row:
            self.table.append(self.row)
            self.row = []
class Students:
    """Roster of student rows with a selectable list of output columns.

    Rows are stored as given.  Columns are addressed by a short field name,
    which is the lower-cased first word of the corresponding header cell.
    Iterating over the object yields, for every stored row, the values of
    the columns named in ``output_fields``.  Iteration state lives on the
    instance (``self.idx``), so nested iteration over the same object is
    not supported.
    """

    def __init__(self):
        self.name_to_idx = {}     # short field name -> column index
        self.student_list = []    # stored rows
        self.field_list = []      # short field names in column order
        self.output_fields = ["section", "id", "netid", "name", "email"]

    def add_header(self, row):
        """Derive the short field names from the header ``row``.

        Only the first header seen is used; later calls are ignored so
        several input files can be loaded one after another.  A blank or
        ``None`` header cell gets the empty short name "" so that column
        positions stay aligned.
        """
        if self.name_to_idx:
            return
        self.field_list = []
        for idx, cell in enumerate(row):
            # short name is only the first word
            words = [] if cell is None else str(cell).split(maxsplit=1)
            shortname = words[0].lower() if words else ""
            self.name_to_idx[shortname] = idx
            self.field_list.append(shortname)
        logging.info("Field names are: " + ','.join(self.field_list))

    def add(self, row):
        """Store ``row``; warn when its length differs from the header's."""
        if len(row) != len(self.field_list):
            # Cells may be non-strings (e.g. None or numbers), so convert
            # them before joining to keep the warning itself from failing.
            logging.warning("The number of fields does not seem correct: "
                            + ','.join(map(str, row)))
        self.student_list.append(row)

    def get_field_list(self):
        """Return the list of short field names in column order."""
        return self.field_list

    def __iter__(self):
        self.idx = 0
        return self

    def __next__(self):
        if self.idx >= len(self.student_list):
            raise StopIteration
        student = self.student_list[self.idx]
        record = [student[self.name_to_idx[f]] for f in self.output_fields]
        self.idx += 1
        return record

    def __str__(self):
        # show all fields
        return '\n'.join([str(s) for s in self.student_list])

    def set_major(self):
        """Append a 'major' column computed from each row's 'program' column.

        Raises:
            KeyError: If the header has no 'program' field.
        """
        idx = self.name_to_idx['program']
        self.name_to_idx['major'] = len(self.field_list)
        self.field_list.append('major')
        for s in self.student_list:
            s.append(get_major_from_program(s[idx]))

    def set_fields(self, fsel):
        """Replace the output field list with ``fsel``.

        An empty ``fsel`` keeps the current selection; the single entry
        'all' selects every known field.
        """
        if len(fsel) == 0:
            return
        # output all fields
        if len(fsel) == 1 and fsel[0] == 'all':
            self.output_fields = list(self.field_list)
            return
        self.output_fields = []
        self.add_fields(fsel)

    def add_fields(self, fields):
        """Append ``fields`` (case-insensitive) to the output field list.

        Raises:
            AssertionError: If a name is not one of the known fields.
        """
        for f in [f.lower() for f in fields]:
            assert f in self.field_list, (
                f"Unknown field '{f}'; known fields: "
                + ','.join(self.field_list))
            self.output_fields.append(f)
def write_csv(file, students, nl=None):
    """Write every record of ``students`` to ``file`` as CSV.

    Args:
        file: Writable text file object.
        students: Iterable of records, each a sequence of field values.
        nl: Line terminator to use; ``None`` keeps the csv module default.
    """
    options = {} if nl is None else {"lineterminator": nl}
    csv.writer(file, **options).writerows(students)
# load from real xlsx files.
def load_file_xlsx(file, students):
    """Load the first worksheet of an xlsx workbook into ``students``.

    The first worksheet row is passed to ``students.add_header`` and each
    following row to ``students.add``, as lists of cell values.

    Args:
        file: Workbook to open with ``openpyxl.load_workbook``.
        students: Object providing ``add_header(row)`` and ``add(row)``.

    Returns:
        True when all rows were loaded; False, after printing the error,
        when any exception occurred while loading.
    """
    import openpyxl
    try:
        workbook = openpyxl.load_workbook(file, data_only=True)
        sheet = workbook.worksheets[0]
        for position, cells in enumerate(sheet.rows):
            values = [cell.value for cell in cells]
            # The first row of the sheet carries the column headers.
            if position == 0:
                students.add_header(values)
            else:
                students.add(values)
    except Exception as e:
        print(e)
        return False
    return True
### Helper functions
def is_tr(s):
    """Return True when ``s`` begins with the text '<tr'."""
    row_prefix = '<tr'
    return s.startswith(row_prefix)
def is_end_of_row(s):
    """Return True when ``s`` begins with '<tr' or with '</table>'."""
    return s.startswith(('<tr', '</table>'))
# load from HTML file
def load_html_file(file, students):
    """Parse the table rows of an HTML file and add them to ``students``.

    The file is read as UTF-8 and parsed with ``MyHTMLParser``.  The first
    parsed row is passed to ``students.add_header`` and every later row to
    ``students.add``.

    Args:
        file: Path of the HTML file to read.
        students: Object providing ``add_header(row)`` and ``add(row)``.

    Returns:
        True when the file was read and its rows were added; False, after
        printing the error, when the file could not be opened or read.
    """
    # Keep the try body down to the I/O.  OSError covers FileNotFoundError
    # as well as e.g. permission errors and directories, so every failure
    # to read the file is reported through the return value.
    try:
        with open(file, 'r', encoding='utf-8') as handle:
            content = handle.read()
    except OSError as e:
        print(e)
        return False

    parser = MyHTMLParser()
    parser.feed(content)

    rows = iter(parser.table)
    header = next(rows, None)
    if header is not None:
        students.add_header(header)
    for row in rows:
        students.add(row)
    return True
# Command-line interface.
parser = argparse.ArgumentParser(description='Clean up downloaded roster from HuskyCT. April 2023.')
parser.add_argument('infiles', nargs='+', default=[], help='Input files. Can have multiple files.')
parser.add_argument('-o', nargs='?', default='', const='', help='Output file.')
parser.add_argument('-f', nargs='+', default=[], help='Add the list of fields to the default list')
parser.add_argument('--fields', nargs='+', default=[], help='Set the list of fields to print.')
parser.add_argument('--listfields', action='store_true', help='List fields.')
parser.add_argument("-v", action='count', default=0, help='Verbose level.')
args = parser.parse_args()

# -v enables INFO messages, -vv (or more) DEBUG messages.
if args.v == 1:
    logging.basicConfig(level=logging.INFO)
elif args.v > 1:
    logging.basicConfig(level=logging.DEBUG)
logging.debug(args)

# Load every input file into one roster; stop at the first file that fails.
students = Students()
for file in args.infiles:
    logging.info(f"Loading from {file}...")
    if not load_html_file(file, students):
        # sys.exit instead of the site-provided exit(), which is meant for
        # the interactive shell and is not always available.
        sys.exit(1)

if args.listfields:
    print(students.get_field_list())
    sys.exit(0)

# Add the computed 'major' column, then apply the field selection:
# --fields replaces the default output list, -f extends it.
students.set_major()
students.set_fields(args.fields)
students.add_fields(args.f)

# Write to the file given with -o, or to standard output otherwise.
if args.o != '':
    with open(args.o, 'w', newline='') as csvfile:
        write_csv(csvfile, students)
else:
    write_csv(sys.stdout, students, '\n')