Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
support html table in cleanup-roster.py
  • Loading branch information
Jerry Shi committed Dec 23, 2023
1 parent 72d25ef commit fa06a68
Show file tree
Hide file tree
Showing 2 changed files with 80 additions and 17 deletions.
71 changes: 61 additions & 10 deletions cleanup-roster.py
@@ -1,7 +1,6 @@
#!/usr/bin/python3
import sys, argparse, fileinput, logging
import csv, re
import openpyxl

from enum import Enum, auto

Expand All @@ -28,8 +27,9 @@ def is_hot_major(m, checksecond = False):
def get_major_from_program(prog):
# m = re.findall(r"(?: -|/)([^/]+)", prog)
# majors = re.search("_x000D_(.+)", prog)
majors = re.search(r" -\s+(.+)", prog)
prog = re.sub(r"&", "&", prog)
# print(prog)
majors = re.search(r" -\s*(.+)", prog)
assert majors is not None
# print(majors.group())

Expand All @@ -50,6 +50,7 @@ def get_major_from_program(prog):

class State(Enum):
    """Parser states used by load_html_file while scanning a roster line by line."""

    # No '<tr' line has been seen yet.
    START = auto()
    # Accumulating the lines of the header row (the <th> cells).
    HEADER = auto()
    # Accumulating the lines of a data row (the <td> cells).
    ROW = auto()

class Students:
Expand All @@ -67,7 +68,7 @@ class Students:
self.field_list = []
for cell in row:
# short name is only the first word
shortname = cell.value.split(maxsplit=1)[0].lower()
shortname = cell.split(maxsplit=1)[0].lower()
self.name_to_idx[shortname] = idx
self.field_list.append(shortname)
idx += 1
Expand All @@ -79,9 +80,9 @@ class Students:

if len(row) != len(self.field_list):
logging.warning("The number of fields does not seem correct: "
+ ','.join(m))
m = [ c.value for c in row ]
self.student_list.append(m)
+ ','.join(row))
self.student_list.append(row)
# print(len(self.student_list), row)

def get_field_list(self):
    """Return the list of short field names stored on this object."""
    fields = self.field_list
    return fields
Expand Down Expand Up @@ -134,24 +135,74 @@ def write_csv(file, students, nl = None):
for s in students:
csvwriter.writerow(s)

def load_file(file, students):
# load from real xlsx files.
def load_file_xlsx(file, students):
    """Load a roster from a real XLSX workbook into *students*.

    The first row of the first worksheet is passed to
    ``students.add_header`` and every following row to ``students.add``.
    Rows are handed over as lists of plain cell values, never as openpyxl
    cell objects, because the header code calls ``str.split`` on each item.

    Args:
        file: Path of the workbook to read.
        students: Collector object providing ``add_header(row)`` and
            ``add(row)``.

    Returns:
        True on success; False after printing the error when the workbook
        cannot be opened or read.
    """
    # Imported here rather than at module level; presumably so that the HTML
    # code path keeps working when openpyxl is not installed.
    import openpyxl

    try:
        wb = openpyxl.load_workbook(file, data_only=True)
        try:
            worksheet = wb.worksheets[0]
            for index, row in enumerate(worksheet.rows):
                values = [cell.value for cell in row]
                if index == 0:
                    students.add_header(values)
                else:
                    students.add(values)
        finally:
            # Release the underlying file handle even if a row fails.
            wb.close()
    except Exception as e:
        # Deliberately broad: openpyxl surfaces several unrelated exception
        # types for unreadable files, and the caller only needs a boolean.
        print(e)
        return False
    return True

### Helper functions
def is_tr(s):
    """Return True when the line *s* begins with an opening ``<tr`` tag."""
    row_open_tag = '<tr'
    return s.startswith(row_open_tag)

def is_end_of_row(s):
    """Return True when the line *s* terminates the row being accumulated.

    A row ends where the next ``<tr`` begins or where the table is closed.
    """
    terminators = ('<tr', '</table>')
    return s.startswith(terminators)

# load from HTML file
def load_html_file(file, students):
    """Load a roster from a downloaded HTML table into *students*.

    The file is scanned line by line with a small state machine:

    * START  - skip everything up to the first line starting with ``<tr``.
    * HEADER - accumulate lines until the next ``<tr`` line, then pass the
      ``<th>`` cell texts to ``students.add_header``.
    * ROW    - accumulate lines until a ``<tr`` or ``</table>`` line, then
      pass the ``<td>`` cell texts to ``students.add``.

    Trailing whitespace (including line breaks) is stripped from every
    accumulated line, so a cell spread over several lines is joined into one
    string. The text on the ``<tr`` / ``</table>`` line itself is not added to
    any row.

    Args:
        file: Path of the HTML file to read (decoded as UTF-8).
        students: Collector object providing ``add_header(row)`` and
            ``add(row)``.

    Returns:
        True on success; False after printing the error when the file cannot
        be opened, read, or decoded.
    """
    # Compiled once instead of being looked up for every row.
    header_cells = re.compile(r'<th[^>]*>(.*?)</th>', re.IGNORECASE)
    data_cells = re.compile(r'<td[^>]*>(.*?)</td>', re.IGNORECASE)

    def add_pending_row(buffer):
        # Buffers without any <td> cell (e.g. text between </table> and a
        # later <tr>) are not student records, so they are skipped.
        cells = data_cells.findall(buffer)
        if cells:
            students.add(cells)
            logging.debug(cells)

    state = State.START
    row = ""
    try:
        with open(file, 'r', encoding='utf-8') as infile:
            for line in infile:

                if state == State.START:
                    if is_tr(line):
                        row = ""
                        state = State.HEADER

                elif state == State.HEADER:
                    if is_tr(line):
                        # The header row is complete once the next row opens.
                        students.add_header(header_cells.findall(row))
                        row = ""
                        state = State.ROW
                    else:
                        # remove spaces at the end of the line, including newline
                        row += line.rstrip()

                else:  # State.ROW
                    if is_end_of_row(line):
                        add_pending_row(row)
                        row = ""
                    else:
                        row += line.rstrip()

        # Flush whatever was still being accumulated at end of file, so a
        # table that is not followed by a terminating line loses nothing.
        if state == State.ROW:
            add_pending_row(row)
        elif state == State.HEADER:
            cells = header_cells.findall(row)
            if cells:
                students.add_header(cells)

    except (OSError, UnicodeDecodeError) as e:
        # OSError covers FileNotFoundError as well as permission and
        # directory errors; the caller only checks the boolean result.
        print(e)
        return False
    return True

parser = argparse.ArgumentParser(description='Clean up downloaded roster from HuskyCT. April 2023.')
parser.add_argument('infiles', nargs='+', default=[], help='Input files. Can have multiple files.')
parser.add_argument('-o', nargs='?', default='', const='', help='Output file.')
Expand All @@ -173,7 +224,7 @@ students = Students()

for file in args.infiles:
logging.info(f"Loading from {file}...")
if not load_file(file, students):
if not load_html_file(file, students):
exit(1)

if args.listfields:
Expand Down
26 changes: 19 additions & 7 deletions readme.MD
@@ -1,9 +1,8 @@
## A Python script that cleans up rosters downloaded from HuskyCT

The downloaded file is actually an HTML file although Excel can
extract tables from it. The format may have changed recently. Now
a student's info is on multiple rows. This script reads a downloaded
file and writes selected columns into a CSV file.
This script reads downloaded roster files and writes selected columns/fields
into a CSV file. If the output filename is not specified, the lines are
printed to the standard output, which can be redirected into a file.

### Usage

Expand Down Expand Up @@ -40,15 +39,28 @@ python cleanup-roster.py downloaded.xls --fields all
In PowerShell, the following commands process multiple xls files.

```
# appends all student rows to all.csv
cat *.xls | py cleanup-roster.py -o all.csv
# write all students from multiple files to a single output file, all.csv
py cleanup-roster.py $(dir *.xls) -o all.csv
# save student records in separate csv files
dir *.xls | foreach { py cleanup-roster.py $_ -o ($_.name -replace '.xls','.csv') }
dir *.xls | foreach-object { py cleanup-roster.py $_ -o ($_.name -replace '\.xls$','.csv') }
```

## Notes

### Dec 2023

The downloaded files are actually HTML files, and the information is in an HTML
table. Excel can open them, but only after showing a warning. The format of the
files has changed several times. Currently, the "Program and Plan" field is in a
single table cell. For example,

<td >Engineering -
^MComputer Science</td>

The script was changed to read the downloaded HTML file directly, instead of
reading XLSX files.

### Nov 2022

The format of downloaded files keeps changing, mainly in the "Program and Plan" field.
Expand Down

0 comments on commit fa06a68

Please sign in to comment.