# cleanup-roster-html.py

#!/usr/bin/python3
import sys, argparse, fileinput, logging
import csv, re
from enum import Enum, auto

def is_hot_major(m, checksecond=False):
    """Return the short code for a recognised major, or "" otherwise.

    m is a program/major description string.  When checksecond is true,
    only strings ending in "Second" are considered (some records are
    appended " as Second" and some " Second"); anything else yields "".

    A string that starts with "Computer" but matches none of the known
    prefixes triggers a warning and still yields "".
    """
    if checksecond and not m.endswith("Second"):
        return ""

    # Order matters: the longer "Computer Science & Engineering" prefix
    # must be tried before the plain "Computer Science" one.
    known_prefixes = (
        ("Computer Science & Engineering", "CSE"),
        ("Computer Science", "CS"),
        ("Computer Engineering", "CompE"),
        ("Electrical Engineering", "EE"),
    )
    for prefix, code in known_prefixes:
        if m.startswith(prefix):
            return code

    if m.startswith("Computer"):
        logging.warning(f"Program '{m}' starts with Computer.")
    return ""

def get_major_from_program(prog):
    """Derive a major label from a program string.

    The string is split into the pieces that follow " -" or "/".  The
    first piece is checked with is_hot_major(); if it is not recognised,
    the remaining pieces are checked in order as second majors.  When
    nothing is recognised, the first piece itself is returned.

    Asserts that at least one piece was found.
    """
    pieces = re.findall(r"(?: -|/)([^/]+)", prog)

    assert len(pieces) > 0

    first = pieces[0]
    major = is_hot_major(first)

    if major == "":
        # Lazily scan the remaining pieces; stop at the first recognised one.
        second_codes = (is_hot_major(piece, True) for piece in pieces[1:])
        major = next((code for code in second_codes if code), "")

    if major == "":
        major = first
    logging.debug(f"{pieces} ==> {major}")
    return major

class State(Enum):
    """States of the line scanner that collects table rows."""

    # Not inside a row; looking for a line that opens one.
    START = 1
    # Inside a row; accumulating lines until the row ends.
    ROW = 2

class Students:
    """Student records collected from HTML table rows.

    Rows are fed in one at a time through add().  A row containing <td>
    cells becomes a student record; a row without them is treated as the
    header and defines the column names.  Iterating over the object yields
    one list per student containing only the columns in output_fields.
    """

    def __init__(self):
        # Short column name -> position of that column within a record.
        self.name_to_idx = {}
        # One list of cell strings per student row.
        self.student_list = []
        # Short column names, in table order.
        self.field_list = []
        # Columns emitted when iterating, in output order.
        self.output_fields = ["section", "id", "netid", "name", "email"]
        # Cursor used by __next__; set here so next() works before iter().
        self.idx = 0

    def add_header(self, line):
        """Record column names from a header row.

        Only the first header that yields names is used; later calls are
        ignored.  Each column's short name is the first whitespace-separated
        word of the <th> cell, lower-cased.
        """
        if self.name_to_idx:
            return
        cells = re.findall(r'<th[^>]*>(.*?)</th>', line, re.IGNORECASE)
        self.field_list = []
        for idx, cell in enumerate(cells):
            words = cell.split(maxsplit=1)
            # A blank header cell still occupies a column; give it an empty
            # name instead of failing on words[0].
            shortname = words[0].lower() if words else ""
            self.name_to_idx[shortname] = idx
            self.field_list.append(shortname)
        logging.info("Field names are: " + ','.join(self.field_list))

    def add(self, line):
        """Add one table row: a student record, or the header if no <td>."""
        logging.debug(line)
        # replace &amp;
        line1 = re.sub(r"&amp;", "&", line)
        cells = re.findall(r'<td[^>]*>(.*?)</td>', line1, re.IGNORECASE)
        if cells:
            if len(cells) != len(self.field_list):
                logging.warning("The number of fields does not seem correct: "
                                + ','.join(cells))
            self.student_list.append(cells)
        else:
            self.add_header(line1)

    def __iter__(self):
        """Restart iteration from the first student and return self."""
        self.idx = 0
        return self

    def __next__(self):
        """Return the next student's values for the columns in output_fields."""
        if self.idx >= len(self.student_list):
            raise StopIteration
        student = self.student_list[self.idx]
        record = [student[self.name_to_idx[f]] for f in self.output_fields]
        self.idx += 1
        return record

    def __str__(self):
        """Return every record with all of its fields, one record per line."""
        return '\n'.join(str(s) for s in self.student_list)

    def set_major(self):
        """Append a 'major' column derived from each record's 'program' cell.

        Raises KeyError if no 'program' column is known.
        """
        idx = self.name_to_idx['program']
        self.name_to_idx['major'] = len(self.field_list)
        self.field_list.append('major')
        for s in self.student_list:
            s.append(get_major_from_program(s[idx]))

    def set_fields(self, fsel):
        """Replace the output columns with fsel.

        An empty fsel leaves the current selection untouched; the single
        value 'all' selects every known column.
        """
        if not fsel:
            return

        # output all fields
        if len(fsel) == 1 and fsel[0] == 'all':
            self.output_fields = list(self.field_list)
            return

        self.output_fields = []
        self.add_fields(fsel)

    def add_fields(self, fields):
        """Append fields (matched case-insensitively) to the output columns.

        Raises ValueError for a name that is not a known column.
        """
        for f in (name.lower() for name in fields):
            # Explicit check rather than assert, which python -O strips.
            if f not in self.field_list:
                raise ValueError(
                    f"Unknown field '{f}'; available fields: "
                    + ','.join(self.field_list))
            self.output_fields.append(f)

def write_csv(file, students, nl=None):
    """Write every record in students to file as CSV.

    file is a writable text stream and students an iterable of rows.
    nl, when given, is used as the line terminator; otherwise the csv
    module's default terminator applies.
    """
    writer_options = {} if nl is None else {"lineterminator": nl}
    csv.writer(file, **writer_options).writerows(students)

def is_tr(s):
    """Return True when s begins with the opening of a <tr tag."""
    row_open = '<tr'
    return s.startswith(row_open)

def is_end_of_row(s):
    """Return True when s begins with a bare <tr> tag or with </table>."""
    row_terminators = ('<tr>', '</table>')
    return s.startswith(row_terminators)

# Command-line interface: input files, optional output file, field
# selection and verbosity.
parser = argparse.ArgumentParser(description='Clean up downloaded roster from HuskyCT. Nov 2022.')
parser.add_argument('infiles', nargs='*', default=[], help='Input files.')
parser.add_argument('-o', nargs='?', default='', const='', help='Output file.')
parser.add_argument('-f', nargs='+', default=[], help='Add the list of fields to the default list')
parser.add_argument('--fields', nargs='+', default=[], help='Set the list of fields to print.')
parser.add_argument("-v", action='count', default=0, help='Verbose level.')

args = parser.parse_args()

# -v selects INFO, -vv or more selects DEBUG; otherwise logging is left
# unconfigured.
if args.v == 1:
    logging.basicConfig(level=logging.INFO)
elif args.v > 1:
    logging.basicConfig(level=logging.DEBUG)

logging.debug(args)

students = Students()

# Two-state scanner.  In START it looks for a line that opens a row; in ROW
# it concatenates lines until is_end_of_row() matches, then hands the whole
# row to students.add().  A row still open when the input ends is not added.
state = State.START
row = ''
try:
    for line in fileinput.input(args.infiles):
        # remove spaces at the end of the line, including newline
        line = line.rstrip()

        if state == State.ROW:
            if is_end_of_row(line):
                students.add(row)
                state = State.START
            else:
                row += line
                continue

        # looking for a row; the line that ended the previous row may
        # itself open the next one
        if is_tr(line):
            row = line
            state = State.ROW

except FileNotFoundError as e:
    # Say which input is missing instead of exiting without a message.
    logging.error(f"Cannot open input file: {e}")
    sys.exit(1)

students.set_major()
students.set_fields(args.fields)
students.add_fields(args.f)

# Write to the named file, or to stdout with '\n' line endings when no
# output file was given.
if args.o != '':
    with open(args.o, 'w', newline='') as csvfile:
        write_csv(csvfile, students)
else:
    write_csv(sys.stdout, students, '\n')
	#!/usr/bin/python3
	import sys, argparse, fileinput, logging
	import csv, re
	from enum import Enum, auto

	def is_hot_major(m, checksecond=False):
	    """Return the short code for a recognised major, or "" otherwise.

	    m is a program/major description string.  When checksecond is true,
	    only strings ending in "Second" are considered (some records are
	    appended " as Second" and some " Second"); anything else yields "".

	    A string that starts with "Computer" but matches none of the known
	    prefixes triggers a warning and still yields "".
	    """
	    if checksecond and not m.endswith("Second"):
	        return ""

	    # Order matters: the longer "Computer Science & Engineering" prefix
	    # must be tried before the plain "Computer Science" one.
	    if m.startswith("Computer Science & Engineering"):
	        major = "CSE"
	    elif m.startswith("Computer Science"):
	        major = "CS"
	    elif m.startswith("Computer Engineering"):
	        major = "CompE"
	    elif m.startswith("Electrical Engineering"):
	        major = "EE"
	    else:
	        if m.startswith("Computer"):
	            logging.warning(f"Program '{m}' starts with Computer.")
	        major = ""
	    return major

	def get_major_from_program(prog):
	    """Derive a major label from a program string.

	    The string is split into the pieces that follow " -" or "/".  The
	    first piece is checked with is_hot_major(); if it is not recognised,
	    the remaining pieces are checked in order as second majors.  When
	    nothing is recognised, the first piece itself is returned.

	    Asserts that at least one piece was found.
	    """
	    # The alternation must be a bare "|"; an escaped "\|" would only
	    # match a literal pipe character.
	    m = re.findall(r"(?: -|/)([^/]+)", prog)

	    assert len(m) > 0

	    major = is_hot_major(m[0])

	    if major == "":
	        for p in m[1:]:
	            major = is_hot_major(p, True)
	            if major:
	                break
	    if major == "":
	        major = m[0]
	    logging.debug(f"{m} ==> {major}")
	    return major

	class State(Enum):
	    """States of the line scanner that collects table rows."""

	    # Not inside a row; looking for a line that opens one.
	    START = auto()
	    # Inside a row; accumulating lines until the row ends.
	    ROW = auto()

	class Students:
	    """Student records collected from HTML table rows.

	    Rows are fed in one at a time through add().  A row containing <td>
	    cells becomes a student record; a row without them is treated as the
	    header and defines the column names.  Iterating over the object yields
	    one list per student containing only the columns in output_fields.
	    """

	    def __init__(self):
	        # Short column name -> position of that column within a record.
	        self.name_to_idx = {}
	        # One list of cell strings per student row.
	        self.student_list = []
	        # Short column names, in table order.
	        self.field_list = []
	        # Columns emitted when iterating, in output order.
	        self.output_fields = ["section", "id", "netid", "name", "email"]
	        # Cursor used by __next__; set here so next() works before iter().
	        self.idx = 0

	    def add_header(self, line):
	        """Record column names from a header row.

	        Only the first header that yields names is used; later calls are
	        ignored.  Each column's short name is the first whitespace-separated
	        word of the <th> cell, lower-cased.
	        """
	        if self.name_to_idx:
	            return
	        cells = re.findall(r'<th[^>]*>(.*?)</th>', line, re.IGNORECASE)
	        self.field_list = []
	        for idx, cell in enumerate(cells):
	            words = cell.split(maxsplit=1)
	            # A blank header cell still occupies a column; give it an empty
	            # name instead of failing on words[0].
	            shortname = words[0].lower() if words else ""
	            self.name_to_idx[shortname] = idx
	            self.field_list.append(shortname)
	        logging.info("Field names are: " + ','.join(self.field_list))

	    def add(self, line):
	        """Add one table row: a student record, or the header if no <td>."""
	        logging.debug(line)
	        # replace &amp;
	        line1 = re.sub(r"&amp;", "&", line)
	        cells = re.findall(r'<td[^>]*>(.*?)</td>', line1, re.IGNORECASE)
	        if cells:
	            if len(cells) != len(self.field_list):
	                logging.warning("The number of fields does not seem correct: "
	                                + ','.join(cells))
	            self.student_list.append(cells)
	        else:
	            self.add_header(line1)

	    def __iter__(self):
	        """Restart iteration from the first student and return self."""
	        self.idx = 0
	        return self

	    def __next__(self):
	        """Return the next student's values for the columns in output_fields."""
	        if self.idx >= len(self.student_list):
	            raise StopIteration
	        student = self.student_list[self.idx]
	        record = [student[self.name_to_idx[f]] for f in self.output_fields]
	        self.idx += 1
	        return record

	    def __str__(self):
	        """Return every record with all of its fields, one record per line."""
	        return '\n'.join(str(s) for s in self.student_list)

	    def set_major(self):
	        """Append a 'major' column derived from each record's 'program' cell.

	        Raises KeyError if no 'program' column is known.
	        """
	        idx = self.name_to_idx['program']
	        self.name_to_idx['major'] = len(self.field_list)
	        self.field_list.append('major')
	        for s in self.student_list:
	            s.append(get_major_from_program(s[idx]))

	    def set_fields(self, fsel):
	        """Replace the output columns with fsel.

	        An empty fsel leaves the current selection untouched; the single
	        value 'all' selects every known column.
	        """
	        if not fsel:
	            return

	        # output all fields
	        if len(fsel) == 1 and fsel[0] == 'all':
	            self.output_fields = list(self.field_list)
	            return

	        self.output_fields = []
	        self.add_fields(fsel)

	    def add_fields(self, fields):
	        """Append fields (matched case-insensitively) to the output columns.

	        Raises ValueError for a name that is not a known column.
	        """
	        for f in (name.lower() for name in fields):
	            # Explicit check rather than assert, which python -O strips.
	            if f not in self.field_list:
	                raise ValueError(
	                    f"Unknown field '{f}'; available fields: "
	                    + ','.join(self.field_list))
	            self.output_fields.append(f)

	def write_csv(file, students, nl=None):
	    """Write every record in students to file as CSV.

	    file is a writable text stream and students an iterable of rows.
	    nl, when given, is used as the line terminator; otherwise the csv
	    module's default terminator applies.
	    """
	    if nl is None:
	        csvwriter = csv.writer(file)
	    else:
	        csvwriter = csv.writer(file, lineterminator=nl)
	    for s in students:
	        csvwriter.writerow(s)

	def is_tr(s):
	    """Return True when s begins with the opening of a <tr tag."""
	    return s.startswith('<tr')

	def is_end_of_row(s):
	    """Return True when s begins with a bare <tr> tag or with </table>."""
	    return s.startswith('<tr>') or s.startswith('</table>')

	# Command-line interface: input files, optional output file, field
	# selection and verbosity.
	parser = argparse.ArgumentParser(description='Clean up downloaded roster from HuskyCT. Nov 2022.')
	parser.add_argument('infiles', nargs='*', default=[], help='Input files.')
	parser.add_argument('-o', nargs='?', default='', const='', help='Output file.')
	parser.add_argument('-f', nargs='+', default=[], help='Add the list of fields to the default list')
	parser.add_argument('--fields', nargs='+', default=[], help='Set the list of fields to print.')
	parser.add_argument("-v", action='count', default=0, help='Verbose level.')

	args = parser.parse_args()

	# -v selects INFO, -vv or more selects DEBUG; otherwise logging is left
	# unconfigured.
	if args.v == 1:
	    logging.basicConfig(level=logging.INFO)
	elif args.v > 1:
	    logging.basicConfig(level=logging.DEBUG)

	logging.debug(args)

	students = Students()

	# Two-state scanner.  In START it looks for a line that opens a row; in ROW
	# it concatenates lines until is_end_of_row() matches, then hands the whole
	# row to students.add().  A row still open when the input ends is not added.
	state = State.START
	row = ''
	try:
	    for line in fileinput.input(args.infiles):
	        # remove spaces at the end of the line, including newline
	        line = line.rstrip()

	        if state == State.ROW:
	            if is_end_of_row(line):
	                students.add(row)
	                state = State.START
	            else:
	                row += line
	                continue

	        # looking for a row; the line that ended the previous row may
	        # itself open the next one
	        if is_tr(line):
	            row = line
	            state = State.ROW

	except FileNotFoundError as e:
	    # Say which input is missing instead of exiting without a message.
	    logging.error(f"Cannot open input file: {e}")
	    sys.exit(1)

	students.set_major()
	students.set_fields(args.fields)
	students.add_fields(args.f)

	# Write to the named file, or to stdout with '\n' line endings when no
	# output file was given.
	if args.o != '':
	    with open(args.o, 'w', newline='') as csvfile:
	        write_csv(csvfile, students)
	else:
	    write_csv(sys.stdout, students, '\n')