Skip to content

Commit

Permalink
update script to read new format
Browse files Browse the repository at this point in the history
  • Loading branch information
Jerry Shi committed Apr 3, 2023
1 parent d17041f commit feb0e56
Show file tree
Hide file tree
Showing 2 changed files with 226 additions and 45 deletions.
187 changes: 187 additions & 0 deletions cleanup-roster-old.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,187 @@
#!/usr/bin/python3
import sys, argparse, fileinput, logging
import csv, re
from enum import Enum, auto

def is_hot_major(m, checksecond = False):
    """Return the abbreviation of a "hot" major named by `m`, or "".

    `m` is matched by prefix against a fixed list of program names. When
    `checksecond` is true, `m` is only considered if it ends with "Second";
    otherwise "" is returned without looking at the prefix.
    """
    # some records are appended " as Second" and some " Second"
    if checksecond and not m.endswith("Second"):
        return ""

    # Checked in order: "Computer Science & Engineering" must come before
    # "Computer Science", which is a prefix of it.
    hot_prefixes = (
        ("Computer Science & Engineering", "CSE"),
        ("Computer Science", "CS"),
        ("Computer Engineering", "CompE"),
        ("Electrical Engineering", "EE"),
    )
    for prefix, abbrev in hot_prefixes:
        if m.startswith(prefix):
            return abbrev

    # Not a known major; flag anything else that starts with "Computer" so
    # an unrecognized program name is noticed.
    if m.startswith("Computer"):
        logging.warning(f"Program '{m}' starts with Computer.")
    return ""

def get_major_from_program(prog):
    """Derive a major label from the program string `prog`.

    The string is split into the pieces that follow " -" or "/". If the
    first piece is a hot major its abbreviation is returned. Otherwise the
    remaining pieces are tried as second majors, and if none of them is a
    hot major the first piece itself is returned unchanged.
    """
    pieces = re.findall(r"(?: -|/)([^/]+)", prog)

    assert len(pieces) > 0

    primary, *others = pieces
    major = is_hot_major(primary)

    if major == "":
        # The generator is lazy, so checking stops at the first piece that
        # is recognized as a hot second major.
        second_majors = (is_hot_major(p, True) for p in others)
        major = next(filter(None, second_majors), "")
        if major == "":
            major = primary
    logging.debug(f"{pieces} ==> {major}")
    return major

class State(Enum):
    """Parser state used while scanning the input line by line."""

    # looking for the line that starts a table row
    START = 1
    # inside a row, accumulating its lines
    ROW = 2

class Students:
    """Student records collected from the rows of an HTML roster table.

    Rows are fed in one at a time with `add()`. A row containing <td> cells
    is stored as a student record; any other row is treated as the header
    and supplies the column names. Iterating over the object yields, for
    each student, the values of the columns listed in `output_fields`.
    """

    def __init__(self):
        # short column name -> index of that column within a record
        self.name_to_idx = {}
        # one list of cell strings per student, in input order
        self.student_list = []
        # short column names, in column order
        self.field_list = []
        # columns produced when iterating
        self.output_fields = ["section", "id", "netid", "name", "email"]

    def add_header(self, line):
        """Take column names from the <th> cells in `line`.

        The short name of a column is the first word of its header text,
        lowercased. Once column names are known, later calls do nothing.
        """
        if self.name_to_idx:
            return
        headers = re.findall(r'<th[^>]*>(.*?)</th>', line, re.IGNORECASE)
        self.field_list = []
        for idx, text in enumerate(headers):
            shortname = text.split(maxsplit=1)[0].lower()
            self.name_to_idx[shortname] = idx
            self.field_list.append(shortname)
        logging.info("Field names are: " + ','.join(self.field_list))

    def add(self, line):
        """Add one table row, given as a single string of HTML.

        A row with <td> cells becomes a student record. A row without any
        is passed to `add_header()`.
        """
        logging.debug(line)
        # replace &amp;
        line1 = re.sub(r"&amp;", "&", line)
        cells = re.findall(r'<td[^>]*>(.*?)</td>', line1, re.IGNORECASE)
        if not cells:
            self.add_header(line1)
            return
        if len(cells) != len(self.field_list):
            # The record is still kept; the warning only flags it.
            logging.warning("The number of fields does not seem correct: "
                            + ','.join(cells))
        self.student_list.append(cells)

    def __iter__(self):
        # The cursor lives on the instance, so iteration restarts from the
        # first student each time iter() is called.
        self.idx = 0
        return self

    def __next__(self):
        """Return the `output_fields` values of the next student."""
        if self.idx >= len(self.student_list):
            raise StopIteration
        student = self.student_list[self.idx]
        record = [student[self.name_to_idx[f]] for f in self.output_fields]
        self.idx += 1
        return record

    def __str__(self):
        # show all fields
        return '\n'.join(str(s) for s in self.student_list)

    def set_major(self):
        """Append a 'major' column computed from each record's 'program'."""
        idx = self.name_to_idx['program']
        self.name_to_idx['major'] = len(self.field_list)
        self.field_list.append('major')
        for s in self.student_list:
            s.append(get_major_from_program(s[idx]))

    def set_fields(self, fsel):
        """Replace the output columns with `fsel`.

        An empty `fsel` leaves the current selection alone, and the single
        name 'all' selects every known column.

        Raises:
            ValueError: if a name in `fsel` is not a known column. The
                current selection is left unchanged in that case.
        """
        if len(fsel) == 0:
            return

        # output all fields
        if len(fsel) == 1 and fsel[0] == 'all':
            self.output_fields = list(self.field_list)
            return

        self.output_fields = self._checked_fields(fsel)

    def add_fields(self, fields):
        """Append `fields` to the output columns.

        Raises:
            ValueError: if a name in `fields` is not a known column. No
                column is appended in that case.
        """
        self.output_fields.extend(self._checked_fields(fields))

    def _checked_fields(self, fields):
        """Return `fields` lowercased, raising ValueError on unknown names."""
        names = [f.lower() for f in fields]
        unknown = [f for f in names if f not in self.field_list]
        if unknown:
            # An explicit raise rather than an assert: the names come from
            # the command line, and asserts are removed under `python -O`.
            raise ValueError(
                "Unknown field(s): " + ', '.join(unknown)
                + ". Available fields: " + ', '.join(self.field_list))
        return names

def write_csv(file, students, nl = None):
    """Write every record in `students` to `file` as CSV.

    `nl`, when given, is used as the line terminator; otherwise the csv
    module's default is used.
    """
    options = {} if nl is None else {"lineterminator": nl}
    csv.writer(file, **options).writerows(students)

def is_tr(s):
    """Return True if line `s` begins with an opening <tr tag."""
    row_open = '<tr'
    return s.startswith(row_open)

def is_end_of_row(s):
    """Return True if line `s` ends the row being collected.

    A row ends where the next row starts ('<tr>') or where the table is
    closed ('</table>').
    """
    row_terminators = ('<tr>', '</table>')
    return s.startswith(row_terminators)

# Command-line entry point: parse the options, collect the table rows from
# the input files, then write the selected fields as CSV.
parser = argparse.ArgumentParser(description='Clean up downloaded roster from HuskyCT. Nov 2022.')
parser.add_argument('infiles', nargs='*', default=[], help='Input files.')
parser.add_argument('-o', nargs='?', default='', const='', help='Output file.')
parser.add_argument('-f', nargs='+', default=[], help='Add the list of fields to the default list')
parser.add_argument('--fields', nargs='+', default=[], help='Set the list of fields to print.')
parser.add_argument("-v", action='count', default=0, help='Verbose level.')

args = parser.parse_args()

# -v enables INFO messages, -vv (or more) enables DEBUG messages.
if args.v == 1:
    logging.basicConfig(level=logging.INFO)
elif args.v > 1:
    logging.basicConfig(level=logging.DEBUG)

logging.debug(args)

students = Students()

# A table row spans several input lines. Lines are accumulated in `row`
# from the line that opens the row until the line that ends it.
state = State.START
row = ''
try:
    # The context manager closes the input when the loop is done.
    with fileinput.input(args.infiles) as lines:
        for line in lines:
            # remove spaces at the end of the line, including newline
            line = line.rstrip()

            if state == State.ROW:
                if is_end_of_row(line):
                    students.add(row)
                    state = State.START
                    # NOTE(review): no `continue` here, so a '<tr>' line
                    # that ends one row also starts the next one below.
                    # Nesting reconstructed from a listing without
                    # indentation -- confirm against the file.
                else:
                    row += line
                    continue

            # looking for a row
            if is_tr(line):
                row = line
                state = State.ROW

except FileNotFoundError as e:
    # Report which file is missing instead of exiting silently.
    logging.error(f"Cannot read input: {e}")
    sys.exit(1)

students.set_major()
students.set_fields(args.fields)
students.add_fields(args.f)

if args.o != '':
    # newline='' lets the csv writer control the line endings itself.
    with open(args.o, 'w', newline='') as csvfile:
        write_csv(csvfile, students)
else:
    write_csv(sys.stdout, students, '\n')
84 changes: 39 additions & 45 deletions cleanup-roster.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#!/usr/bin/python3
import sys, argparse, fileinput, logging
import csv, re
import openpyxl

from enum import Enum, auto

def is_hot_major(m, checksecond = False):
Expand All @@ -24,9 +26,13 @@ def is_hot_major(m, checksecond = False):
return major

def get_major_from_program(prog):
m = re.findall(r"(?: -|/)([^/]+)", prog)
# m = re.findall(r"(?: -|/)([^/]+)", prog)

majors = re.search("_x000D_(.+)", prog)
assert majors is not None
# print(majors.group())

assert len(m) > 0
m = majors.group(1).split("/")

major = is_hot_major(m[0])

Expand All @@ -53,33 +59,28 @@ def __init__(self):
self.field_list = []
self.output_fields = ["section", "id", "netid", "name", "email"]

def add_header(self, line):
def add_header(self, row):
if len(self.name_to_idx):
return
m = re.findall(r'<th[^>]*>(.*?)</th>', line, re.IGNORECASE)
idx = 0
self.field_list = []
for f in m:
shortname = f.split(maxsplit=1)[0].lower()
for cell in row:
# short name is only the first word
shortname = cell.value.split(maxsplit=1)[0].lower()
self.name_to_idx[shortname] = idx
self.field_list.append(shortname)
idx += 1
logging.info("Field names are: " + ','.join(self.field_list))

def add(self, line):
def add(self, row):
# replace &amp;
logging.debug(line)
line1 = re.sub(r"&amp;", "&", line)
# remove br tag
# s = re.sub(r"<br\s*/>", "", s)
m = re.findall(r'<td[^>]*>(.*?)</td>', line1, re.IGNORECASE)
if m:
if len(m) != len(self.field_list):
logging.warning("The number of fields does not seem correct: "
# line1 = re.sub(r"&amp;", "&", line)

if len(row) != len(self.field_list):
logging.warning("The number of fields does not seem correct: "
+ ','.join(m))
self.student_list.append(m)
else:
self.add_header(line1)
m = [ c.value for c in row ]
self.student_list.append(m)

def __iter__(self):
self.idx = 0
Expand Down Expand Up @@ -129,13 +130,25 @@ def write_csv(file, students, nl = None):
for s in students:
csvwriter.writerow(s)

def is_tr(s):
return s.startswith('<tr')
def load_file(file, students):
    """Load the first worksheet of workbook `file` into `students`.

    The first row is passed to `students.add_header()` and every later
    row to `students.add()`.

    Returns:
        True if the file was loaded, False if anything went wrong. The
        error is logged and not re-raised, so a bad file does not stop
        the remaining files from being processed.
    """
    try:
        wb = openpyxl.load_workbook(file, data_only=True)
        worksheet = wb.worksheets[0]

        rows = iter(worksheet.rows)
        header = next(rows, None)
        if header is not None:
            students.add_header(header)
        for row in rows:
            students.add(row)

    except Exception as e:
        # Logged rather than printed: stdout may be carrying the CSV
        # output, which an error message would corrupt.
        logging.error(f"Cannot load {file}: {e}")
        return False
    return True

parser = argparse.ArgumentParser(description='Clean up downloaded roster from HuskyCT. Nov 2022.')
parser = argparse.ArgumentParser(description='Clean up downloaded roster from HuskyCT. April 2023.')
parser.add_argument('infiles', nargs='*', default=[], help='Input files.')
parser.add_argument('-o', nargs='?', default='', const='', help='Output file.')
parser.add_argument('-f', nargs='+', default=[], help='Add the list of fields to the default list')
Expand All @@ -153,28 +166,9 @@ def is_end_of_row(s):

students = Students()

state = State.START
row = ''
try:
for line in fileinput.input(args.infiles):
# remove spaces at the end of the line, including newline
line = line.rstrip()

if state == State.ROW:
if is_end_of_row(line):
students.add(row)
state = State.START
else:
row += line
continue

# looking for a row
if is_tr(line):
row = line
state = State.ROW

except FileNotFoundError as e:
exit(1)
for file in args.infiles:
logging.info(f"Loading from {file}...")
load_file(file, students)

students.set_major()
students.set_fields(args.fields)
Expand Down

0 comments on commit feb0e56

Please sign in to comment.