diff --git a/cleanup-roster-old.py b/cleanup-roster-old.py
new file mode 100644
index 0000000..0623a40
--- /dev/null
+++ b/cleanup-roster-old.py
@@ -0,0 +1,187 @@
+#!/usr/bin/python3
+import sys, argparse, fileinput, logging
+import csv, re
+from enum import Enum, auto
+
+def is_hot_major(m, checksecond = False):
+    """Return the short code of a tracked major, or "" when `m` is not one.
+
+    m: program/major description; it is matched by prefix.
+    checksecond: when true, only descriptions ending in "Second" are
+        considered; any other description yields "".
+    """
+    # some records are appended " as Second" and some " Second"
+    if checksecond and not m.endswith("Second"):
+        return ""
+
+    # Order matters: the longer "Computer Science & Engineering" prefix must
+    # be tried before the plain "Computer Science" one.
+    known_prefixes = (
+        ("Computer Science & Engineering", "CSE"),
+        ("Computer Science", "CS"),
+        ("Computer Engineering", "CompE"),
+        ("Electrical Engineering", "EE"),
+    )
+    for prefix, code in known_prefixes:
+        if m.startswith(prefix):
+            return code
+
+    # Not a tracked major; still report other "Computer..." programs so that
+    # an unexpected program name gets noticed.
+    if m.startswith("Computer"):
+        logging.warning(f"Program '{m}' starts with Computer.")
+    return ""
+
+def get_major_from_program(prog):
+    """Derive the major to report from a program description string.
+
+    The string is cut into segments: each run of non-"/" characters that
+    follows " -" or "/".  The result is, in order of preference:
+      1. the short code of the first segment, if is_hot_major() knows it;
+      2. the short code of the first later segment that is marked as a
+         second major and is known to is_hot_major();
+      3. the first segment, unchanged.
+
+    Fails an assertion when `prog` contains no segment at all.
+    """
+    segments = re.findall(r"(?: -|/)([^/]+)", prog)
+
+    assert len(segments) > 0
+
+    primary = segments[0]
+    major = is_hot_major(primary)
+
+    if major == "":
+        # the primary program is not tracked: look at the second majors
+        for candidate in segments[1:]:
+            major = is_hot_major(candidate, True)
+            if major:
+                break
+        if major == "":
+            major = primary
+    logging.debug(f"{segments} ==> {major}")
+    return major
+
+# Scanner states for the row-collecting loop at the bottom of the script.
+class State(Enum):
+ # START: no row is open; waiting for a line that begins one
+ START = auto()
+ # ROW: a row is open; following lines are appended until its end is seen
+ ROW = auto()
+
+# Collection of roster records parsed from table rows.
+# Iterating over an instance yields, for every stored record, the list of
+# values of the fields currently named in output_fields.
+class Students:
+
+ def __init__(self):
+ # short field name -> column index inside a record
+ self.name_to_idx = {}
+ # one list of field values per student, in input order
+ self.student_list = []
+ # short field names in column order
+ self.field_list = []
+ # fields produced by iteration; changed by set_fields() / add_fields()
+ self.output_fields = ["section", "id", "netid", "name", "email"]
+
+ # Record the column names found in a header line.
+ # Only the first header seen is used: once name_to_idx is filled the call
+ # returns without doing anything.
+ def add_header(self, line):
+ if len(self.name_to_idx):
+ return
+ # NOTE(review): the pattern literal below is split over two lines and
+ # looks damaged (markup seems to have been stripped from it) - restore
+ # it from the original file before relying on this method.
+ m = re.findall(r'
]*>(.*?) | ', line, re.IGNORECASE)
+ idx = 0
+ self.field_list = []
+ for f in m:
+ # the short name is the first whitespace-separated word, lower-cased
+ shortname = f.split(maxsplit=1)[0].lower()
+ self.name_to_idx[shortname] = idx
+ self.field_list.append(shortname)
+ idx += 1
+ logging.info("Field names are: " + ','.join(self.field_list))
+
+ # Parse one row: a line with cell matches is stored as a student record,
+ # a line without any is handed to add_header().
+ def add(self, line):
+ # replace &
+ logging.debug(line)
+ # NOTE(review): as written this substitutes "&" with "&", i.e. it
+ # changes nothing; presumably an HTML entity was meant - confirm
+ # against the original file.
+ line1 = re.sub(r"&", "&", line)
+ # remove br tag
+ # s = re.sub(r"
", "", s)
+ # NOTE(review): this pattern literal looks damaged as well - confirm.
+ m = re.findall(r']*>(.*?) | ', line1, re.IGNORECASE)
+ if m:
+ # a size mismatch is only reported; the record is kept anyway
+ if len(m) != len(self.field_list):
+ logging.warning("The number of fields does not seem correct: "
+ + ','.join(m))
+ self.student_list.append(m)
+ else:
+ self.add_header(line1)
+
+ # The instance is its own iterator: the cursor lives in self.idx and is
+ # reset here, so two iterations over the same instance at the same time
+ # share (and disturb) one cursor.
+ def __iter__(self):
+ self.idx = 0
+ return self
+
+ # Return the next record reduced to the fields listed in output_fields.
+ def __next__(self):
+ if self.idx >= len(self.student_list):
+ raise StopIteration
+ # print(self.idx, self.student_list[self.idx])
+ record = [ self.student_list[self.idx][self.name_to_idx[f]] for f in self.output_fields ]
+ self.idx += 1
+ return record
+
+ # One line per record, each showing every stored field.
+ def __str__(self):
+ # show all fields
+ return '\n'.join([ str(s) for s in self.student_list])
+
+ # Add a 'major' column, computed from the 'program' column, to every
+ # record.  Raises KeyError when no 'program' field is known.
+ def set_major(self):
+ idx = self.name_to_idx['program']
+ # the new column goes after the existing ones
+ self.name_to_idx['major'] = len(self.field_list)
+ self.field_list.append('major')
+ for s in self.student_list:
+ s.append(get_major_from_program(s[idx]))
+
+ # Replace the output field list with fsel.
+ # An empty fsel keeps the current list; the single entry 'all' selects
+ # every known field.
+ def set_fields(self, fsel):
+ if len(fsel) == 0:
+ return
+
+ # output all fields
+ if len(fsel) == 1 and fsel[0] == 'all':
+ self.output_fields = list(self.field_list)
+ return
+
+ self.output_fields = []
+ self.add_fields(fsel)
+
+ # Append the given field names (lower-cased) to the output field list.
+ # NOTE(review): unknown names are rejected with assert only, so the check
+ # is not performed when Python runs with -O.
+ def add_fields(self, fields):
+ for f in [ f.lower() for f in fields]:
+ assert f in self.field_list
+ self.output_fields.append(f)
+
+def write_csv(file, students, nl = None):
+    """Write every record of `students` to `file` as one CSV row each.
+
+    file: writable text file object.
+    students: iterable of records (sequences of field values).
+    nl: line terminator for the CSV writer; None keeps the csv module default.
+    """
+    # only override the terminator when the caller asked for one
+    writer_options = {} if nl is None else {"lineterminator": nl}
+    csv.writer(file, **writer_options).writerows(students)
+
+# Tell whether line `s` begins a row; the main loop starts collecting a row
+# when this returns true.
+# NOTE(review): both prefixes below are empty strings as shown, which makes
+# the test true for every str.  The literals look damaged (markup apparently
+# stripped) - restore them from the original file.
+def is_tr(s):
+ return s.startswith('') or s.startswith('')
+
+# Command line: input files (stdin when none is given), optional output
+# file, and the selection of fields to print.
+parser = argparse.ArgumentParser(description='Clean up downloaded roster from HuskyCT. Nov 2022.')
+parser.add_argument('infiles', nargs='*', default=[], help='Input files.')
+parser.add_argument('-o', nargs='?', default='', const='', help='Output file.')
+parser.add_argument('-f', nargs='+', default=[], help='Add the list of fields to the default list')
+parser.add_argument('--fields', nargs='+', default=[], help='Set the list of fields to print.')
+parser.add_argument("-v", action='count', default=0, help='Verbose level.')
+
+args = parser.parse_args()
+
+# -v enables INFO messages, -vv (or more) DEBUG messages
+if args.v == 1:
+    logging.basicConfig(level=logging.INFO)
+elif args.v > 1:
+    logging.basicConfig(level=logging.DEBUG)
+
+logging.debug(args)
+
+students = Students()
+
+# A row may span several input lines: once is_tr() recognises the first line
+# of a row, the following lines are concatenated to it until is_end_of_row()
+# matches; the collected text is then handed to students.add().
+state = State.START
+row = ''
+try:
+    # the context manager closes the input stream even if parsing fails
+    with fileinput.input(args.infiles) as lines:
+        for line in lines:
+            # remove spaces at the end of the line, including newline
+            line = line.rstrip()
+
+            if state == State.ROW:
+                if is_end_of_row(line):
+                    students.add(row)
+                    state = State.START
+                else:
+                    row += line
+            elif is_tr(line):
+                # looking for a row
+                row = line
+                state = State.ROW
+except FileNotFoundError as e:
+    # name the missing input instead of exiting without a word
+    logging.error("%s", e)
+    sys.exit(1)
+
+students.set_major()
+students.set_fields(args.fields)
+students.add_fields(args.f)
+
+if args.o != '':
+    # newline='' lets the csv module control the line endings in the file
+    with open(args.o, 'w', newline='') as csvfile:
+        write_csv(csvfile, students)
+else:
+    write_csv(sys.stdout, students, '\n')
diff --git a/cleanup-roster.py b/cleanup-roster.py
index 0623a40..c2af6df 100644
--- a/cleanup-roster.py
+++ b/cleanup-roster.py
@@ -1,6 +1,8 @@
#!/usr/bin/python3
import sys, argparse, fileinput, logging
import csv, re
+import openpyxl
+
from enum import Enum, auto
def is_hot_major(m, checksecond = False):
@@ -24,9 +26,13 @@ def is_hot_major(m, checksecond = False):
return major
def get_major_from_program(prog):
- m = re.findall(r"(?: -|/)([^/]+)", prog)
+ # m = re.findall(r"(?: -|/)([^/]+)", prog)
+
+ majors = re.search("_x000D_(.+)", prog)
+ assert majors is not None
+ # print(majors.group())
- assert len(m) > 0
+ m = majors.group(1).split("/")
major = is_hot_major(m[0])
@@ -53,33 +59,28 @@ def __init__(self):
self.field_list = []
self.output_fields = ["section", "id", "netid", "name", "email"]
- def add_header(self, line):
+ def add_header(self, row):
if len(self.name_to_idx):
return
- m = re.findall(r']*>(.*?) | ', line, re.IGNORECASE)
idx = 0
self.field_list = []
- for f in m:
- shortname = f.split(maxsplit=1)[0].lower()
+ for cell in row:
+ # short name is only the first word
+ shortname = cell.value.split(maxsplit=1)[0].lower()
self.name_to_idx[shortname] = idx
self.field_list.append(shortname)
idx += 1
logging.info("Field names are: " + ','.join(self.field_list))
- def add(self, line):
+    def add(self, row):
# replace &
- logging.debug(line)
- line1 = re.sub(r"&", "&", line)
- # remove br tag
- # s = re.sub(r"
", "", s)
- m = re.findall(r']*>(.*?) | ', line1, re.IGNORECASE)
- if m:
- if len(m) != len(self.field_list):
- logging.warning("The number of fields does not seem correct: "
+        # Store one row as a list of its cells' values (only `.value` is read).
+        # The values are collected first because the warning below prints them.
+        m = [c.value for c in row]
+        if len(row) != len(self.field_list):
+            logging.warning("The number of fields does not seem correct: "
- + ','.join(m))
+                + ','.join(map(str, m)))
- self.student_list.append(m)
- else:
- self.add_header(line1)
+        self.student_list.append(m)
def __iter__(self):
self.idx = 0
@@ -129,13 +130,25 @@ def write_csv(file, students, nl = None):
for s in students:
csvwriter.writerow(s)
-def is_tr(s):
- return s.startswith('
') or s.startswith('')
+ except Exception as e:
+ print(e)
+ return False
+ return True
-parser = argparse.ArgumentParser(description='Clean up downloaded roster from HuskyCT. Nov 2022.')
+parser = argparse.ArgumentParser(description='Clean up downloaded roster from HuskyCT. April 2023.')
parser.add_argument('infiles', nargs='*', default=[], help='Input files.')
parser.add_argument('-o', nargs='?', default='', const='', help='Output file.')
parser.add_argument('-f', nargs='+', default=[], help='Add the list of fields to the default list')
@@ -153,28 +166,9 @@ def is_end_of_row(s):
students = Students()
-state = State.START
-row = ''
-try:
- for line in fileinput.input(args.infiles):
- # remove spaces at the end of the line, including newline
- line = line.rstrip()
-
- if state == State.ROW:
- if is_end_of_row(line):
- students.add(row)
- state = State.START
- else:
- row += line
- continue
-
- # looking for a row
- if is_tr(line):
- row = line
- state = State.ROW
-
-except FileNotFoundError as e:
- exit(1)
+for file in args.infiles:
+ logging.info(f"Loading from {file}...")
+ load_file(file, students)
students.set_major()
students.set_fields(args.fields)