diff --git a/cleanup-roster.py b/cleanup-roster.py
index 9efc0e5..89ab14e 100644
--- a/cleanup-roster.py
+++ b/cleanup-roster.py
@@ -5,9 +5,9 @@
def is_hot_major(m, checksecond = False):
if checksecond:
+ # some records end with " as Second" and others with just " Second"
if not m.endswith("Second"):
return ""
- m = m[-10:]
if m.startswith("Computer Science & Engineering"):
major = "CSE"
@@ -24,8 +24,9 @@ def is_hot_major(m, checksecond = False):
return major
def get_major_from_program(prog):
- m = re.findall(r"[-/]([^/]+)", prog)
+ m = re.findall(r"(?: -|/)([^/]+)", prog)
+ print(m)
assert len(m) > 0
major = is_hot_major(m[0])
@@ -68,6 +69,7 @@ def add_header(self, line):
def add(self, line):
# replace &
+ logging.debug(line)
line1 = re.sub(r"&", "&", line)
# remove br tag
# s = re.sub(r"
", "", s)
@@ -134,7 +136,7 @@ def is_tr(s):
def is_end_of_row(s):
return s.startswith('
') or s.startswith('')
-parser = argparse.ArgumentParser(description='Clean up downloaded roster from HuskyCT')
+parser = argparse.ArgumentParser(description='Clean up downloaded roster from HuskyCT. Nov 2022.')
parser.add_argument('infiles', nargs='*', default=[], help='Input files.')
parser.add_argument('-o', nargs='?', default='', const='', help='Output file.')
parser.add_argument('-f', nargs='+', default=[], help='Add the list of fields to the default list')
@@ -155,8 +157,7 @@ def is_end_of_row(s):
state = State.START
row = ''
try:
-# for line in fileinput.input(args.infiles):
- for line in fileinput.input("ps.xls"):
+ for line in fileinput.input(args.infiles):
# remove spaces at the end of the line, including newline
line = line.rstrip()
diff --git a/readme.MD b/readme.MD
index fc36fca..fda41e3 100644
--- a/readme.MD
+++ b/readme.MD
@@ -46,3 +46,22 @@ cat *.xls | py cleanup-roster.py -o all.csv
# save student records in separate csv files
dir *.xls | foreach { py cleanup-roster.py $_ -o ($_.name -replace '.xls','.csv') }
```
+
+## Notes
+
+### Nov 2022
+
+The format of downloaded files keeps changing, mainly in the "Program and Plan" field.
+The format in Nov 2022 is:
+
+ - <\r>
+ - <\r>/
+ - <\r>//
+ - <\r>/ as Second
+ - <\r>/ Second
+
+The school can be one of:
+
+ Engineering
+ Liberal Arts & Sciences
+ CCS Non-Degree