Skip to content

Commit

Permalink
refactor code. add major info
Browse files Browse the repository at this point in the history
  • Loading branch information
zhs04001 committed Nov 16, 2022
1 parent 603d406 commit c863cf9
Show file tree
Hide file tree
Showing 2 changed files with 193 additions and 99 deletions.
254 changes: 162 additions & 92 deletions cleanup-roster.py
Original file line number Diff line number Diff line change
@@ -1,117 +1,187 @@
import sys, re, argparse
import csv
#!/usr/bin/python3
import sys, argparse, fileinput, logging
import csv, re
from enum import Enum, auto

def is_hot_major(m, checksecond=False):
    """Map a major name to the abbreviation of a tracked ("hot") major.

    Args:
        m: Major text taken from the "Program and Plan" field, for example
            "Computer Science" or "Computer Engineering as Second".
        checksecond: When true, only text marked as a second major (text
            ending in "Second") is considered; anything else yields "".

    Returns:
        "CSE", "CS", "CompE" or "EE" when the text starts with the
        corresponding major name, otherwise the empty string.
    """
    # The value may be assembled from several lines of the downloaded file,
    # so tolerate stray whitespace around it.
    text = m.strip()

    # Some records are appended " as Second" and some " Second".
    if checksecond and not text.endswith("Second"):
        return ""

    # Order matters: "Computer Science & Engineering" must be tried before
    # its own prefix "Computer Science".
    hot_majors = (
        ("Computer Science & Engineering", "CSE"),
        ("Computer Science", "CS"),
        ("Computer Engineering", "CompE"),
        ("Electrical Engineering", "EE"),
    )
    for prefix, abbreviation in hot_majors:
        if text.startswith(prefix):
            return abbreviation

    # Flag unknown "Computer ..." programs so a new name is noticed.
    if text.startswith("Computer"):
        logging.warning("Program '%s' starts with Computer.", m)
    return ""

def get_major_from_program(prog):
    """Extract the major from a "Program and Plan" value.

    The value is split into plans: the text after " -" and each following
    "/"-separated part. The first plan is the primary major.

    Args:
        prog: Content of the program field, e.g.
            "Engineering -Computer Science/Mathematics".

    Returns:
        The hot-major abbreviation when the primary major is a hot major,
        otherwise the abbreviation of the first later plan that is a hot
        second major, otherwise the primary major text itself.

    Raises:
        ValueError: If no plan can be found in ``prog``.
    """
    plans = re.findall(r"(?: -|/)([^/]+)", prog)

    # An explicit check: an assert would be stripped when running with -O.
    if not plans:
        raise ValueError(f"No major found in program '{prog}'")

    major = is_hot_major(plans[0])
    if not major:
        # Later plans only count when they are marked as a second major.
        second_majors = (is_hot_major(plan, True) for plan in plans[1:])
        major = next((m for m in second_majors if m), "")
    if not major:
        # Not a hot major at all: report the primary major as written.
        major = plans[0].strip()

    logging.debug("%s ==> %s", plans, major)
    return major

class Student:
class State(Enum):
    """State of the line-by-line scan of the downloaded roster."""

    START = auto()  # outside a row, looking for a line that opens one
    ROW = auto()    # inside a row, collecting its lines

class Students:
    """Roster table parsed from HTML rows, with a selectable output column set.

    Rows are fed in one at a time with ``add``. Iterating over the object
    yields, for every student, the list of values of ``output_fields``.
    """

    def __init__(self):
        self.name_to_idx = {}    # short field name -> column index
        self.student_list = []   # one list of cell strings per student
        self.field_list = []     # short field names, in column order
        self.output_fields = ["section", "id", "netid", "name", "email"]
        self.idx = 0             # cursor used by __iter__/__next__

    def add_header(self, line):
        """Record the column names found in the ``<th>`` cells of ``line``.

        Only the first header seen is used. The short name of a column is
        the first word of its title, lower-cased.
        """
        if self.name_to_idx:
            return
        titles = re.findall(r'<th[^>]*>(.*?)</th>', line, re.IGNORECASE)
        if not titles:
            # Not a header row; nothing to record.
            return
        self.field_list = []
        for idx, title in enumerate(titles):
            words = title.split(maxsplit=1)
            # An empty <th></th> keeps its column position under the name "".
            shortname = words[0].lower() if words else ""
            self.name_to_idx[shortname] = idx
            self.field_list.append(shortname)
        logging.info("Field names are: %s", ','.join(self.field_list))

    def add(self, line):
        """Add one table row, given as a single string of HTML.

        A row with ``<td>`` cells is stored as a student record; any other
        row is passed to ``add_header``.
        """
        logging.debug(line)
        # replace &amp;
        text = line.replace("&amp;", "&")
        cells = re.findall(r'<td[^>]*>(.*?)</td>', text, re.IGNORECASE)
        if not cells:
            self.add_header(text)
            return
        if len(cells) != len(self.field_list):
            logging.warning("The number of fields does not seem correct: %s",
                            ','.join(cells))
        self.student_list.append(cells)

    def __iter__(self):
        self.idx = 0
        return self

    def __next__(self):
        if self.idx >= len(self.student_list):
            raise StopIteration
        row = self.student_list[self.idx]
        self.idx += 1
        columns = [self.name_to_idx[f] for f in self.output_fields]
        # add() keeps rows with too few cells (with a warning); give their
        # missing columns an empty value instead of failing here.
        return [row[c] if c < len(row) else "" for c in columns]

    def __str__(self):
        # show all fields
        return '\n'.join(str(s) for s in self.student_list)

    def set_major(self):
        """Append a ``major`` column computed from the ``program`` column."""
        if 'major' in self.name_to_idx:
            # Already computed; do not append a second column.
            return
        if 'program' not in self.name_to_idx:
            logging.warning("No 'program' field; the major is not set.")
            return
        program_idx = self.name_to_idx['program']
        major_idx = len(self.field_list)
        self.name_to_idx['major'] = major_idx
        self.field_list.append('major')
        for s in self.student_list:
            # Pad short rows so that the major lands in its own column.
            if len(s) < major_idx:
                s.extend([""] * (major_idx - len(s)))
            program = s[program_idx]
            major = get_major_from_program(program) if program else ""
            s.insert(major_idx, major)

    def set_fields(self, fsel):
        """Replace the output columns by ``fsel``; ``['all']`` selects all.

        An empty ``fsel`` leaves the current selection unchanged.

        Raises:
            ValueError: If a name in ``fsel`` is not a known field.
        """
        if not fsel:
            return

        # output all fields
        if len(fsel) == 1 and fsel[0].lower() == 'all':
            self.output_fields = list(self.field_list)
            return

        self.output_fields = self._known_fields(fsel)

    def add_fields(self, fields):
        """Append ``fields`` (case-insensitive names) to the output columns.

        Raises:
            ValueError: If a name in ``fields`` is not a known field; the
                current selection is left untouched in that case.
        """
        self.output_fields.extend(self._known_fields(fields))

    def _known_fields(self, fields):
        """Return ``fields`` lower-cased, rejecting unknown field names."""
        names = [f.lower() for f in fields]
        for name in names:
            # Names come from the command line, so report mistakes clearly.
            if name not in self.field_list:
                raise ValueError(
                    f"Unknown field '{name}'. Valid fields are: "
                    + ','.join(self.field_list))
        return names

def write_csv(file, students, nl=None):
    """Write every student record to ``file`` in CSV format.

    Args:
        file: Writable text file object.
        students: Iterable of records, each a sequence of field values.
        nl: Line terminator to use. ``None`` keeps the default of the
            ``csv`` module (CR LF).
    """
    options = {} if nl is None else {"lineterminator": nl}
    csv.writer(file, **options).writerows(students)

parser = argparse.ArgumentParser(description='Clean up downloaded roster from HuskyCT')
parser.add_argument('infile', help='Input file.')
parser.add_argument('-o', nargs='?', default='', const='', help='Output file.')
parser.add_argument('-f', nargs='+', default='', help='List of field numbers.')
parser.add_argument("-a", action='store_true', default=False, help='Append to the output file.')
parser.add_argument("-v", action='store_true', default=False, help='Verbose.')
def is_tr(s):
    """Return True when the line ``s`` opens a table row (``<tr``).

    The tag is matched case-insensitively, like the ``<th>``/``<td>`` cells
    of the row are.
    """
    return s[:3].lower() == '<tr'

args = parser.parse_args()
if args.v:
print(args)
def is_end_of_row(s):
    """Return True when the line ``s`` ends the row being collected.

    A row ends where the next row opens or where the table closes. A row
    may open with attributes (``<tr ...>``), so any ``<tr`` prefix counts,
    not only the exact ``<tr>``. Tags are matched case-insensitively.
    """
    return s.lower().startswith(('<tr', '</table>'))

student_list = []
student = None
previous = ''
# Command line: input files (standard input when none is given), output
# file, field selection and verbosity.
parser = argparse.ArgumentParser(description='Clean up downloaded roster from HuskyCT. Nov 2022.')
parser.add_argument('infiles', nargs='*', default=[], help='Input files.')
parser.add_argument('-o', nargs='?', default='', const='', help='Output file.')
parser.add_argument('-f', nargs='+', default=[], help='Add the list of fields to the default list')
parser.add_argument('--fields', nargs='+', default=[], help='Set the list of fields to print.')
parser.add_argument("-v", action='count', default=0, help='Verbose level.')

args = parser.parse_args()

# -v shows informational messages, -vv (or more) also debugging output.
if args.v == 1:
    logging.basicConfig(level=logging.INFO)
elif args.v > 1:
    logging.basicConfig(level=logging.DEBUG)

logging.debug(args)

students = Students()

# Collect the lines of each table row into one string and hand complete
# rows to `students`.
state = State.START
row = ''
try:
    for line in fileinput.input(args.infiles):
        # remove spaces at the end of the line, including newline
        line = line.rstrip()

        if state == State.ROW:
            if is_end_of_row(line):
                students.add(row)
                state = State.START
                # Fall through: this line may itself open the next row.
            else:
                row += line
                continue

        # looking for a row
        if is_tr(line):
            row = line
            state = State.ROW

except FileNotFoundError as e:
    # Report on stderr so that the message never ends up in CSV output.
    print(e, file=sys.stderr)
    sys.exit(1)

# Do not lose a row that is still open when the input ends.
if state == State.ROW:
    students.add(row)

students.set_major()
students.set_fields(args.fields)
students.add_fields(args.f)

if args.o != '':
    with open(args.o, 'w', newline='') as csvfile:
        write_csv(csvfile, students)
else:
    write_csv(sys.stdout, students, '\n')
38 changes: 31 additions & 7 deletions readme.MD
Original file line number Diff line number Diff line change
Expand Up @@ -21,23 +21,47 @@ The `-o` option specifies an output file.
python cleanup-roster.py downloaded.xls -o section1.csv
```

The `-a` option specifies the append mode so the output file is not cleared.
Multiple `xls` files can be specified.

The `-f` option specifies the fields to be included. `all`
means all fields. Without `-f` option, only a few most useful
fields are printed.
By default the script does not output all fields. The `-f` option specifies one
or more additional fields to be included. For example, the following command
adds the enrollment date and program fields to the output.

python cleanup-roster.py downloaded.xls -f enrollment program

The `--fields` option specifies the full list of fields to be printed. `all`
means all fields.

```
python cleanup-roster.py downloaded.xls -f 1 2
python cleanup-roster.py downloaded.xls -f all
python cleanup-roster.py downloaded.xls --fields name netid
python cleanup-roster.py downloaded.xls --fields all
```

In Powershell, the following commands process multiple xls files.

```
# appends all student rows to all.csv
dir *.xls | foreach { py cleanup-roster.py $_ -a -o all.csv }
cat *.xls | py cleanup-roster.py -o all.csv
# save student records in separate csv files
dir *.xls | foreach { py cleanup-roster.py $_ -o ($_.name -replace '.xls','.csv') }
```

## Notes

### Nov 2022

The format of the downloaded files keeps changing, mainly in the "Program and Plan" field.
The format in Nov 2022 is:

<school> - <\r><major>
<school> - <\r><major>/<minor>
<school> - <\r><major>/<minor1>/<minor2>
<school> - <\r><major>/<major2> as Second
<school> - <\r><major>/<major2> Second

The school can be:

Engineering
Liberal Arts & Sciences
CCS Non-Degree

0 comments on commit c863cf9

Please sign in to comment.