# cleanup-roster.py

#!/usr/bin/python3
import sys, argparse, fileinput, logging
import csv, re
from html.parser import HTMLParser

from enum import Enum, auto

def is_hot_major(m, checksecond = False):
    """Translate a program/major name into the short code of a tracked major.

    Returns "CSE", "CS", "CompE" or "EE" when ``m`` starts with the matching
    full name, and "" for anything else.  When ``checksecond`` is true, only
    names ending in "Second" are considered; every other name yields "".
    """
    # Secondary majors are recorded with an " as Second" or " Second" suffix.
    if checksecond and not m.endswith("Second"):
        return ""

    # Order matters: "Computer Science & Engineering" must be tried before
    # the shorter "Computer Science" prefix it also matches.
    known_prefixes = (
        ("Computer Science & Engineering", "CSE"),
        ("Computer Science", "CS"),
        ("Computer Engineering", "CompE"),
        ("Electrical Engineering", "EE"),
    )
    for prefix, code in known_prefixes:
        if m.startswith(prefix):
            return code

    # Not a tracked major; flag other "Computer..." programs for attention.
    if m.startswith("Computer"):
        logging.warning(f"Program '{m}' starts with Computer.")
    return ""

def get_major_from_program(prog):
    """Extract the major to report from a roster "program" string.

    The text after the first " -" is split on "/" into candidate majors.
    The first candidate wins if it is a tracked major; otherwise the first
    later candidate that is a tracked *second* major wins; otherwise the
    first candidate is returned verbatim.

    Raises AssertionError when ``prog`` contains no " -" separator.
    """
    found = re.search(r" -\s*(.+)", prog)
    assert  found is not None

    candidates = found.group(1).split("/")
    primary = candidates[0]

    major = is_hot_major(primary)
    if not major:
        # Fall back to the first tracked major that is marked as "Second".
        for extra in candidates[1:]:
            major = is_hot_major(extra, True)
            if major:
                break
    if not major:
        # Nothing tracked: report the primary program name as-is.
        major = primary

    logging.debug(f"{candidates} ==> {major}")
    return major

class MyHTMLParser(HTMLParser):
    """Collect the text of HTML table cells into a list of rows.

    After ``feed()``, ``table`` holds one list per table row, and each row
    holds one string per ``<th>``/``<td>`` cell in document order.  A row is
    closed when the next ``<tr>`` starts or when ``</table>`` is seen.
    """

    def __init__(self):
        super().__init__()
        self.table = []     # completed rows
        self.row = []       # cells of the row currently being read
        self.th = False     # True while inside a <th> cell
        self.td = False     # True while inside a <td> cell
        # Text of the current cell.  Initialised here so that a stray
        # closing </th>/</td> cannot hit a missing attribute.
        self.data = ""

    def handle_starttag(self, tag, attrs):
        """Start a new row on <tr>; start a new, empty cell on <th>/<td>."""
        if tag == "tr":
            self.end_of_row()
        elif tag == "th":
            self.th = True
            self.data = ""
        elif tag == "td":
            self.td = True
            self.data = ""

    def handle_endtag(self, tag):
        """Store the finished cell on </th>/</td>; flush the row on </table>."""
        if tag in ("th", "td"):
            self.row.append(self.data)
            self.th = False
            self.td = False

        if tag == "table":
            self.end_of_row()

    def handle_data(self, data):
        """Append cell text, with newlines replaced by spaces."""
        if self.th or self.td:
            # A cell's text arrives in several pieces when it contains
            # nested tags (e.g. <td><b>Ann</b> Lee</td>); accumulate them
            # instead of keeping only the last piece.
            self.data += data.replace("\n", " ")

    def end_of_row(self):
        """Move the current row into ``table`` if it has any cells."""
        if self.row:
            self.table.append(self.row)
        self.row = []


class Students:
    """Roster table: header-derived field names plus one list per student.

    Iterating over an instance yields, for every stored row, the cells
    selected by ``output_fields`` in that order.
    """

    def __init__(self):
        self.name_to_idx = {}       # short field name -> column index
        self.student_list = []      # stored rows, in insertion order
        self.field_list = []        # short field names, in column order
        # fields produced by iteration, in output order
        self.output_fields = ["section", "id", "netid", "name", "email"]
        self.idx = 0                # iteration cursor; reset by __iter__

    def add_header(self, row):
        """Define the field names from a header row.

        Only the first header takes effect; later calls are ignored so that
        several input files can share one table.
        """
        if self.name_to_idx:
            return
        self.field_list = []
        for idx, cell in enumerate(row):
            # short name is only the first word, lower-cased
            shortname = cell.split(maxsplit=1)[0].lower()
            self.name_to_idx[shortname] = idx
            self.field_list.append(shortname)
        logging.info("Field names are: " + ','.join(self.field_list))

    def add(self, row):
        """Store one student row, warning if its width differs from the header."""
        if len(row) != len(self.field_list):
            # Cells are not necessarily strings (spreadsheet rows carry
            # numbers and None), so convert before joining.
            logging.warning("The number of fields does not seem correct: "
                            + ','.join(str(cell) for cell in row))
        self.student_list.append(row)

    def get_field_list(self):
        """Return the list of short field names in column order."""
        return self.field_list

    def __iter__(self):
        self.idx = 0
        return self

    def __next__(self):
        if self.idx >= len(self.student_list):
            raise StopIteration
        student = self.student_list[self.idx]
        record = [student[self.name_to_idx[f]] for f in self.output_fields]
        self.idx += 1
        return record

    def __str__(self):
        # show all fields of every stored row, one row per line
        return '\n'.join(str(s) for s in self.student_list)

    def set_major(self):
        """Add a 'major' field derived from each row's 'program' cell.

        Raises KeyError when the header has no 'program' field.
        """
        idx = self.name_to_idx['program']
        self.name_to_idx['major'] = len(self.field_list)
        self.field_list.append('major')
        for s in self.student_list:
            s.append(get_major_from_program(s[idx]))

    def set_fields(self, fsel):
        """Replace the output fields with ``fsel``.

        An empty ``fsel`` leaves the selection unchanged; the single entry
        'all' selects every known field.
        """
        if len(fsel) == 0:
            return

        # output all fields
        if len(fsel) == 1 and fsel[0] == 'all':
            self.output_fields = list(self.field_list)
            return

        self.output_fields = []
        self.add_fields(fsel)

    def add_fields(self, fields):
        """Append ``fields`` (case-insensitive) to the output fields.

        Raises AssertionError for a name that is not a known field.
        """
        for f in (name.lower() for name in fields):
            assert f in self.field_list, f"Unknown field '{f}'"
            self.output_fields.append(f)

def write_csv(file, students, nl = None):
    """Write every record of ``students`` to ``file`` as CSV.

    ``students`` is any iterable of rows.  ``nl`` overrides the csv module's
    default line terminator when it is not None.
    """
    options = {} if nl is None else {"lineterminator": nl}
    csv.writer(file, **options).writerows(students)

# load from real xlsx files.
def load_file_xlsx(file, students):
    """Load the first worksheet of an .xlsx workbook into ``students``.

    The first row is passed to ``students.add_header`` and every later row
    to ``students.add``; cells are passed as their stored values.

    Returns True on success.  Any exception raised while loading is printed
    and reported as False.
    """
    import openpyxl
    try:
        workbook = openpyxl.load_workbook(file, data_only=True)
        sheet = workbook.worksheets[0]

        for rownum, cells in enumerate(sheet.rows):
            values = [cell.value for cell in cells]
            if rownum == 0:
                students.add_header(values)
            else:
                students.add(values)

    except Exception as e:
        print(e)
        return False
    return True

### Helper functions
def is_tr(s):
    """Return True when the string ``s`` begins with the text '<tr'."""
    row_open = '<tr'
    return s.startswith(row_open)

def is_end_of_row(s):
    """Return True when ``s`` begins with '<tr' or with '</table>'."""
    row_terminators = ('<tr', '</table>')
    return s.startswith(row_terminators)

# load from HTML file
def load_html_file(file, students):
    """Parse the HTML roster at path ``file`` and add its rows to ``students``.

    The first table row found is passed to ``students.add_header`` and every
    following row to ``students.add``.

    Returns True on success.  If the file cannot be opened, read, or decoded
    as UTF-8, the error is printed and False is returned so the caller can
    stop cleanly.
    """
    try:
        # Bind the handle to its own name so the path parameter is not
        # shadowed.
        with open(file, 'r', encoding='utf-8') as infile:
            text = infile.read()
    except (OSError, UnicodeDecodeError) as e:
        # OSError covers a missing file as well as permission and
        # is-a-directory failures.
        print(e)
        return False

    parser = MyHTMLParser()
    parser.feed(text)

    rows = iter(parser.table)
    header = next(rows, None)
    if header is not None:
        students.add_header(header)
    for row in rows:
        students.add(row)
    return True

def main(argv=None):
    """Command-line entry point: load rosters, derive majors, write CSV.

    ``argv`` is the list of arguments to parse; None means ``sys.argv[1:]``.
    Exits with status 1 when an input file cannot be loaded, and with
    status 0 after printing the field list for ``--listfields``.
    """
    parser = argparse.ArgumentParser(description='Clean up downloaded roster from HuskyCT. April 2023.')
    parser.add_argument('infiles', nargs='+', default=[], help='Input files. Can have multiple files.')
    parser.add_argument('-o', nargs='?', default='', const='', help='Output file.')
    parser.add_argument('-f', nargs='+', default=[], help='Add the list of fields to the default list')
    parser.add_argument('--fields', nargs='+', default=[], help='Set the list of fields to print.')
    parser.add_argument('--listfields', action='store_true', help='List fields.')
    parser.add_argument("-v", action='count', default=0, help='Verbose level.')

    args = parser.parse_args(argv)

    if args.v == 1:
        logging.basicConfig(level=logging.INFO)
    elif args.v > 1:
        logging.basicConfig(level=logging.DEBUG)

    logging.debug(args)

    students = Students()

    for file in args.infiles:
        logging.info(f"Loading from {file}...")
        if not load_html_file(file, students):
            # sys.exit rather than the interactive-only `exit` helper,
            # which the site module may not install.
            sys.exit(1)

    if args.listfields:
        print(students.get_field_list())
        sys.exit(0)

    students.set_major()
    students.set_fields(args.fields)
    students.add_fields(args.f)

    if args.o != '':
        # newline='' lets the csv module control line endings in the file
        with open(args.o, 'w', newline='') as csvfile:
            write_csv(csvfile, students)
    else:
        write_csv(sys.stdout, students, '\n')


# Run only when executed as a script, so importing this module has no
# side effects.
if __name__ == "__main__":
    main()
	#!/usr/bin/python3
	import sys, argparse, fileinput, logging
	import csv, re
	from html.parser import HTMLParser

	from enum import Enum, auto

	def is_hot_major(m, checksecond = False):
	    """Translate a program/major name into the short code of a tracked major.

	    Returns "CSE", "CS", "CompE" or "EE" when ``m`` starts with the matching
	    full name, and "" for anything else.  When ``checksecond`` is true, only
	    names ending in "Second" are considered; every other name yields "".
	    """
	    # Secondary majors are recorded with an " as Second" or " Second" suffix.
	    if checksecond and not m.endswith("Second"):
	        return ""

	    # Order matters: "Computer Science & Engineering" must be tried before
	    # the shorter "Computer Science" prefix it also matches.
	    known_prefixes = (
	        ("Computer Science & Engineering", "CSE"),
	        ("Computer Science", "CS"),
	        ("Computer Engineering", "CompE"),
	        ("Electrical Engineering", "EE"),
	    )
	    for prefix, code in known_prefixes:
	        if m.startswith(prefix):
	            return code

	    # Not a tracked major; flag other "Computer..." programs for attention.
	    if m.startswith("Computer"):
	        logging.warning(f"Program '{m}' starts with Computer.")
	    return ""

	def get_major_from_program(prog):
	    """Extract the major to report from a roster "program" string.

	    The text after the first " -" is split on "/" into candidate majors.
	    The first candidate wins if it is a tracked major; otherwise the first
	    later candidate that is a tracked *second* major wins; otherwise the
	    first candidate is returned verbatim.

	    Raises AssertionError when ``prog`` contains no " -" separator.
	    """
	    found = re.search(r" -\s*(.+)", prog)
	    assert found is not None

	    candidates = found.group(1).split("/")
	    primary = candidates[0]

	    major = is_hot_major(primary)
	    if not major:
	        # Fall back to the first tracked major that is marked as "Second".
	        for extra in candidates[1:]:
	            major = is_hot_major(extra, True)
	            if major:
	                break
	    if not major:
	        # Nothing tracked: report the primary program name as-is.
	        major = primary

	    logging.debug(f"{candidates} ==> {major}")
	    return major

	class MyHTMLParser(HTMLParser):
	    """Collect the text of HTML table cells into a list of rows.

	    After ``feed()``, ``table`` holds one list per table row, and each row
	    holds one string per ``<th>``/``<td>`` cell in document order.  A row is
	    closed when the next ``<tr>`` starts or when ``</table>`` is seen.
	    """

	    def __init__(self):
	        super().__init__()
	        self.table = []     # completed rows
	        self.row = []       # cells of the row currently being read
	        self.th = False     # True while inside a <th> cell
	        self.td = False     # True while inside a <td> cell
	        # Text of the current cell.  Initialised here so that a stray
	        # closing </th>/</td> cannot hit a missing attribute.
	        self.data = ""

	    def handle_starttag(self, tag, attrs):
	        """Start a new row on <tr>; start a new, empty cell on <th>/<td>."""
	        if tag == "tr":
	            self.end_of_row()
	        elif tag == "th":
	            self.th = True
	            self.data = ""
	        elif tag == "td":
	            self.td = True
	            self.data = ""

	    def handle_endtag(self, tag):
	        """Store the finished cell on </th>/</td>; flush the row on </table>."""
	        if tag in ("th", "td"):
	            self.row.append(self.data)
	            self.th = False
	            self.td = False

	        if tag == "table":
	            self.end_of_row()

	    def handle_data(self, data):
	        """Append cell text, with newlines replaced by spaces."""
	        if self.th or self.td:
	            # A cell's text arrives in several pieces when it contains
	            # nested tags; accumulate them instead of keeping only the
	            # last piece.
	            self.data += data.replace("\n", " ")

	    def end_of_row(self):
	        """Move the current row into ``table`` if it has any cells."""
	        if self.row:
	            self.table.append(self.row)
	        self.row = []


	class Students:
	    """Roster table: header-derived field names plus one list per student.

	    Iterating over an instance yields, for every stored row, the cells
	    selected by ``output_fields`` in that order.
	    """

	    def __init__(self):
	        self.name_to_idx = {}       # short field name -> column index
	        self.student_list = []      # stored rows, in insertion order
	        self.field_list = []        # short field names, in column order
	        # fields produced by iteration, in output order
	        self.output_fields = ["section", "id", "netid", "name", "email"]
	        self.idx = 0                # iteration cursor; reset by __iter__

	    def add_header(self, row):
	        """Define the field names from a header row.

	        Only the first header takes effect; later calls are ignored so that
	        several input files can share one table.
	        """
	        if self.name_to_idx:
	            return
	        self.field_list = []
	        for idx, cell in enumerate(row):
	            # short name is only the first word, lower-cased
	            shortname = cell.split(maxsplit=1)[0].lower()
	            self.name_to_idx[shortname] = idx
	            self.field_list.append(shortname)
	        logging.info("Field names are: " + ','.join(self.field_list))

	    def add(self, row):
	        """Store one student row, warning if its width differs from the header."""
	        if len(row) != len(self.field_list):
	            # Cells are not necessarily strings (spreadsheet rows carry
	            # numbers and None), so convert before joining.
	            logging.warning("The number of fields does not seem correct: "
	                            + ','.join(str(cell) for cell in row))
	        self.student_list.append(row)

	    def get_field_list(self):
	        """Return the list of short field names in column order."""
	        return self.field_list

	    def __iter__(self):
	        self.idx = 0
	        return self

	    def __next__(self):
	        if self.idx >= len(self.student_list):
	            raise StopIteration
	        student = self.student_list[self.idx]
	        record = [student[self.name_to_idx[f]] for f in self.output_fields]
	        self.idx += 1
	        return record

	    def __str__(self):
	        # show all fields of every stored row, one row per line
	        return '\n'.join(str(s) for s in self.student_list)

	    def set_major(self):
	        """Add a 'major' field derived from each row's 'program' cell.

	        Raises KeyError when the header has no 'program' field.
	        """
	        idx = self.name_to_idx['program']
	        self.name_to_idx['major'] = len(self.field_list)
	        self.field_list.append('major')
	        for s in self.student_list:
	            s.append(get_major_from_program(s[idx]))

	    def set_fields(self, fsel):
	        """Replace the output fields with ``fsel``.

	        An empty ``fsel`` leaves the selection unchanged; the single entry
	        'all' selects every known field.
	        """
	        if len(fsel) == 0:
	            return

	        # output all fields
	        if len(fsel) == 1 and fsel[0] == 'all':
	            self.output_fields = list(self.field_list)
	            return

	        self.output_fields = []
	        self.add_fields(fsel)

	    def add_fields(self, fields):
	        """Append ``fields`` (case-insensitive) to the output fields.

	        Raises AssertionError for a name that is not a known field.
	        """
	        for f in (name.lower() for name in fields):
	            assert f in self.field_list, f"Unknown field '{f}'"
	            self.output_fields.append(f)

	def write_csv(file, students, nl = None):
	    """Write every record of ``students`` to ``file`` as CSV.

	    ``students`` is any iterable of rows.  ``nl`` overrides the csv module's
	    default line terminator when it is not None.
	    """
	    options = {} if nl is None else {"lineterminator": nl}
	    csv.writer(file, **options).writerows(students)

	# load from real xlsx files.
	def load_file_xlsx(file, students):
	    """Load the first worksheet of an .xlsx workbook into ``students``.

	    The first row is passed to ``students.add_header`` and every later row
	    to ``students.add``; cells are passed as their stored values.

	    Returns True on success.  Any exception raised while loading is printed
	    and reported as False.
	    """
	    import openpyxl
	    try:
	        workbook = openpyxl.load_workbook(file, data_only=True)
	        sheet = workbook.worksheets[0]

	        for rownum, cells in enumerate(sheet.rows):
	            values = [cell.value for cell in cells]
	            if rownum == 0:
	                students.add_header(values)
	            else:
	                students.add(values)

	    except Exception as e:
	        print(e)
	        return False
	    return True

	### Helper functions
	def is_tr(s):
	    """Return True when the string ``s`` begins with the text '<tr'."""
	    return s.startswith('<tr')

	def is_end_of_row(s):
	    """Return True when ``s`` begins with '<tr' or with '</table>'."""
	    return s.startswith(('<tr', '</table>'))

	# load from HTML file
	def load_html_file(file, students):
	    """Parse the HTML roster at path ``file`` and add its rows to ``students``.

	    The first table row found is passed to ``students.add_header`` and every
	    following row to ``students.add``.

	    Returns True on success.  If the file cannot be opened, read, or decoded
	    as UTF-8, the error is printed and False is returned so the caller can
	    stop cleanly.
	    """
	    try:
	        # Bind the handle to its own name so the path parameter is not
	        # shadowed.
	        with open(file, 'r', encoding='utf-8') as infile:
	            text = infile.read()
	    except (OSError, UnicodeDecodeError) as e:
	        # OSError covers a missing file as well as permission and
	        # is-a-directory failures.
	        print(e)
	        return False

	    parser = MyHTMLParser()
	    parser.feed(text)

	    rows = iter(parser.table)
	    header = next(rows, None)
	    if header is not None:
	        students.add_header(header)
	    for row in rows:
	        students.add(row)
	    return True

	def main(argv=None):
	    """Command-line entry point: load rosters, derive majors, write CSV.

	    ``argv`` is the list of arguments to parse; None means ``sys.argv[1:]``.
	    Exits with status 1 when an input file cannot be loaded, and with
	    status 0 after printing the field list for ``--listfields``.
	    """
	    parser = argparse.ArgumentParser(description='Clean up downloaded roster from HuskyCT. April 2023.')
	    parser.add_argument('infiles', nargs='+', default=[], help='Input files. Can have multiple files.')
	    parser.add_argument('-o', nargs='?', default='', const='', help='Output file.')
	    parser.add_argument('-f', nargs='+', default=[], help='Add the list of fields to the default list')
	    parser.add_argument('--fields', nargs='+', default=[], help='Set the list of fields to print.')
	    parser.add_argument('--listfields', action='store_true', help='List fields.')
	    parser.add_argument("-v", action='count', default=0, help='Verbose level.')

	    args = parser.parse_args(argv)

	    if args.v == 1:
	        logging.basicConfig(level=logging.INFO)
	    elif args.v > 1:
	        logging.basicConfig(level=logging.DEBUG)

	    logging.debug(args)

	    students = Students()

	    for file in args.infiles:
	        logging.info(f"Loading from {file}...")
	        if not load_html_file(file, students):
	            # sys.exit rather than the interactive-only `exit` helper,
	            # which the site module may not install.
	            sys.exit(1)

	    if args.listfields:
	        print(students.get_field_list())
	        sys.exit(0)

	    students.set_major()
	    students.set_fields(args.fields)
	    students.add_fields(args.f)

	    if args.o != '':
	        # newline='' lets the csv module control line endings in the file
	        with open(args.o, 'w', newline='') as csvfile:
	            write_csv(csvfile, students)
	    else:
	        write_csv(sys.stdout, students, '\n')


	# Run only when executed as a script, so importing this module has no
	# side effects.
	if __name__ == "__main__":
	    main()