Skip to content

Commit

Permalink
init commit
Browse files Browse the repository at this point in the history
  • Loading branch information
zhs04001 committed Sep 8, 2021
0 parents commit 497211a
Show file tree
Hide file tree
Showing 2 changed files with 130 additions and 0 deletions.
98 changes: 98 additions & 0 deletions cleanup-roster.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
# import system module
import sys, re, argparse
import csv

class Students:
    """One roster record built up cell by cell.

    Every cleaned cell is stored, in order, in ``fields``.  Cells at a few
    fixed column positions are additionally mirrored into named attributes
    (``id``, ``name``, ``program``, ``netid``, ``email``, ``section``).
    """

    # Zero-based column position -> attribute that mirrors that column.
    _NAMED_COLUMNS = {
        1: "id",
        2: "name",
        5: "program",
        8: "netid",
        10: "email",
        13: "section",
    }

    def __init__(self):
        self.id = ""
        self.name = "NameNotSet"
        self.program = ""
        self.netid = ""
        self.email = ""
        self.section = ""
        self.fields = []

    def add_field(self, s):
        """Clean the cell text ``s`` and append it as the next column.

        The HTML entity ``&amp;`` is decoded to ``&`` and ``<br>`` / ``<br/>``
        tags are removed.  If the new column position is one of the named
        columns, the matching attribute is updated as well.
        """
        # Decode the escaped ampersand; the cell text comes from raw HTML.
        s = s.replace("&amp;", "&")
        # Remove line-break tags, with or without the self-closing slash.
        s = re.sub(r"<br\s*/?>", "", s)
        self.fields.append(s)
        attr = self._NAMED_COLUMNS.get(len(self.fields) - 1)
        if attr is not None:
            setattr(self, attr, s)

    def get_fields(self, fdno):
        """Return the columns selected by ``fdno``.

        * empty ``fdno`` -> the short tuple
          ``(section, id, netid, name, email)``;
        * ``['all']`` -> the full ``fields`` list;
        * otherwise -> a list of ``fields[int(n)]`` for each ``n`` in
          ``fdno`` (zero-based positions).

        Raises ``ValueError`` for an entry that is not an integer and
        ``IndexError`` for a position beyond the stored columns.
        """
        if not fdno:  # only a short list
            return (self.section, self.id, self.netid, self.name, self.email)
        if len(fdno) == 1 and fdno[0] == 'all':
            return self.fields
        return [self.fields[int(number)] for number in fdno]

def write_csv(file, student_list, fields):
    """Write one CSV row per student to the open text stream ``file``.

    Each row is whatever ``student.get_fields(fields)`` returns for that
    student, emitted in the order the students appear in ``student_list``.
    """
    rows = (entry.get_fields(fields) for entry in student_list)
    csv.writer(file).writerows(rows)

# Command-line driver: read the downloaded roster (an HTML table), collect
# the <td> cells of each record into a Students object, and write the
# selected columns as CSV to the -o file or to standard output.
parser = argparse.ArgumentParser(description='Clean up downloaded roster from HuskyCT')
parser.add_argument('infile', nargs='?', help='Input file.')
parser.add_argument('-o', nargs='?', default='', const='', help='Output file.')
parser.add_argument('-f', nargs='+', default='', help='List of field numbers.')

args = parser.parse_args()

# 'infile' is declared optional, but there is nothing to read without it;
# exit with a usage message instead of crashing inside open(None).
if args.infile is None:
    parser.error('an input file is required')

student_list = []
student = None   # record currently being assembled, if any
previous = ''    # text of a cell that has not been closed by </td> yet
cell_pattern = re.compile(r'^<td[^>]*>(.*)</td>')

with open(args.infile, 'r') as file:
    for line in file:
        # A new row or the end of the table closes the current record.
        if line.startswith(('<tr>', '</table>')):
            if student is not None:
                student_list.append(student)
            student = None

        # remove spaces at the end of the line, including newline
        line = line.rstrip()

        if line.startswith('<td'):
            # A fresh cell starts here; forget any pending partial text.
            previous = ''
        else:
            # Continuation of a cell that spans several physical lines.
            line = previous + line

        if not line.endswith('</td>'):
            # Cell not closed yet: keep accumulating.
            previous = line
            continue

        # now we have a line that starts with "<td" and ends with "</td>"
        m = cell_pattern.search(line)
        if m:
            if student is None:
                student = Students()
            student.add_field(m.group(1))

# Keep the last record when the input ends without a closing
# '<tr>' / '</table>' line after it.
if student is not None:
    student_list.append(student)

if args.o != '':
    # newline='' lets the csv module control line endings itself.
    with open(args.o, 'w', newline='') as csvfile:
        write_csv(csvfile, student_list, args.f)
else:
    write_csv(sys.stdout, student_list, args.f)
32 changes: 32 additions & 0 deletions readme.MD
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
## A Python script that cleans up rosters downloaded from HuskyCT

The downloaded file is actually an HTML file although Excel can
extract tables from it. The format may have changed recently. Now
a student's info is on multiple rows. This script reads a downloaded
file and writes selected columns into a CSV file.

### Usage

Assume all files are under the current directory.

The following command prints info on the screen.

```
python cleanup-roster.py downloaded.xls
```

The `-o` option specifies an output file.

```
python cleanup-roster.py downloaded.xls -o section1.csv
```

The `-f` option specifies the fields to be included, given as
zero-based column numbers. `all` means all fields. Without the `-f`
option, only a few of the most useful fields (section, ID, NetID,
name, and email) are printed.

```
python cleanup-roster.py downloaded.xls -f 1 2
python cleanup-roster.py downloaded.xls -f all
```

0 comments on commit 497211a

Please sign in to comment.