-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 497211a
Showing
2 changed files
with
130 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,98 @@ | ||
# import system module | ||
import sys, re, argparse | ||
import csv | ||
|
||
class Students:
    """One roster entry assembled from the table cells of a single row.

    Cells are fed in order through ``add_field``.  Every cleaned cell is
    kept in ``fields``; cells at a few known column positions are also
    copied to named attributes (``id``, ``name``, ``program``, ``netid``,
    ``email``, ``section``).
    """

    # Zero-based column position -> attribute that receives that cell.
    _COLUMN_ATTRS = {
        1: "id",
        2: "name",
        5: "program",
        8: "netid",
        10: "email",
        13: "section",
    }

    def __init__(self):
        self.id = ""
        self.name = "NameNotSet"
        self.program = ""
        self.netid = ""
        self.email = ""
        self.section = ""
        # All cleaned cell values, in the order they were added.
        self.fields = []

    def add_field(self, s):
        """Clean the cell text *s* and store it as the next column.

        The HTML-escaped ampersand is decoded and ``<br />`` tags are
        removed.  The cleaned value is appended to ``fields`` and, when
        its column position is one of the known ones, also assigned to
        the corresponding named attribute.
        """
        # Decode the HTML entity for "&" (substituting "&" for "&" would
        # be a no-op and leave "&amp;" in the output).
        s = re.sub(r"&amp;", "&", s)
        # remove br tag
        s = re.sub(r"<br\s*/>", "", s)
        self.fields.append(s)
        col = len(self.fields) - 1
        attr = self._COLUMN_ATTRS.get(col)
        if attr is not None:
            setattr(self, attr, s)

    def get_fields(self, fdno):
        """Return the values selected by *fdno*.

        * empty selection (``''``, ``[]``, ``None``): the short tuple
          ``(section, id, netid, name, email)``;
        * ``['all']``: the full ``fields`` list;
        * otherwise: a list with ``fields[int(n)]`` for each entry ``n``.

        Raises ``ValueError`` for an entry that is not an integer and
        ``IndexError`` for a position past the last stored field.
        """
        # could provide more options to get customized fields
        if not fdno:  # only a short list
            return (self.section, self.id, self.netid, self.name, self.email)
        if len(fdno) == 1 and fdno[0] == 'all':
            return self.fields
        return [self.fields[int(n)] for n in fdno]
|
||
def write_csv(file, student_list, fields):
    """Write one CSV row per student to the open text stream *file*.

    Each row is whatever ``get_fields(fields)`` returns for that student,
    emitted in the order of *student_list*.
    """
    writer = csv.writer(file)
    writer.writerows(entry.get_fields(fields) for entry in student_list)
|
||
# Command-line interface: input roster, optional output file, optional
# list of field numbers.
parser = argparse.ArgumentParser(description='Clean up downloaded roster from HuskyCT')
parser.add_argument('infile', nargs='?', help='Input file.')
parser.add_argument('-o', nargs='?', default='', const='', help='Output file.')
parser.add_argument('-f', nargs='+', default='', help='List of field numbers.')

args = parser.parse_args()
# print(args)

# 'infile' is declared optional (nargs='?'); report a usage error instead
# of failing with a TypeError inside open(None).
if args.infile is None:
    parser.error('an input file is required')

student_list = []
student = None  # record being assembled for the current table row
previous = ''   # text carried over while a cell spans several lines

with open(args.infile, 'r') as file:
    for line in file:
        # A new row, or the end of the table, closes the record in progress.
        if line.startswith(('<tr>', '</table>')):
            if student is not None:
                student_list.append(student)
            student = None

        # remove spaces at the end of the line, including newline
        line = line.rstrip()

        # A line opening a cell starts fresh; any other line is a
        # continuation and is glued onto the text collected so far.
        if line.startswith('<td'):
            previous = ''
        else:
            line = previous + line

        # Cell not closed yet: remember the text and read the next line.
        if not line.endswith('</td>'):
            previous = line
            continue

        # now we have a line that starts with "<td" and ends with "</td>"
        m = re.search(r'^<td[^>]*>(.*)</td>', line)
        if m:
            if student is None:
                student = Students()
            student.add_field(m.group(1))

# Keep a record still in progress when the input ends without a closing
# '</table>' line.
if student is not None:
    student_list.append(student)

# An empty -o value means "write to standard output".
if args.o != '':
    with open(args.o, 'w', newline='') as csvfile:
        write_csv(csvfile, student_list, args.f)
else:
    write_csv(sys.stdout, student_list, args.f)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
## A Python script that cleans up rosters downloaded from HuskyCT | ||
|
||
The downloaded file is actually an HTML file although Excel can | ||
extract tables from it. The format may have changed recently. Now | ||
a student's info is on multiple rows. This script reads a downloaded | ||
file and writes selected columns into a CSV file. | ||
|
||
### Usage | ||
|
||
Assume all files are under the current directory. | ||
|
||
The following command prints info on the screen. | ||
|
||
``` | ||
python cleanup-roster.py downloaded.xls | ||
``` | ||
|
||
The `-o` option specifies an output file. | ||
|
||
``` | ||
python cleanup-roster.py downloaded.xls -o section1.csv | ||
``` | ||
|
||
The `-f` option specifies the fields to include, given as zero-based | ||
column numbers; `all` means all fields. Without the `-f` option, only a few | ||
of the most useful fields (section, ID, NetID, name, and email) are printed. | ||
|
||
``` | ||
python cleanup-roster.py downloaded.xls -f 1 2 | ||
python cleanup-roster.py downloaded.xls -f all | ||
``` | ||
|