Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
use python html parser
  • Loading branch information
Jerry Shi committed Dec 24, 2023
1 parent fa06a68 commit 6c0c854
Showing 1 changed file with 57 additions and 36 deletions.
93 changes: 57 additions & 36 deletions cleanup-roster.py
@@ -1,6 +1,7 @@
#!/usr/bin/python3
import sys, argparse, fileinput, logging
import csv, re
from html.parser import HTMLParser

from enum import Enum, auto

Expand All @@ -27,7 +28,7 @@ def is_hot_major(m, checksecond = False):
def get_major_from_program(prog):
# m = re.findall(r"(?: -|/)([^/]+)", prog)
# majors = re.search("_x000D_(.+)", prog)
prog = re.sub(r"&", "&", prog)
# prog = re.sub(r"&", "&", prog)
# print(prog)
majors = re.search(r" -\s*(.+)", prog)
assert majors is not None
Expand All @@ -48,10 +49,48 @@ def get_major_from_program(prog):
logging.debug(f"{m} ==> {major}")
return major

class State(Enum):
    """Scanner states with three distinct members: START, HEADER and ROW.

    The values are spelled out explicitly; they are the same 1, 2, 3 that
    ``auto()`` assigns when the members are declared in this order.
    """

    # Presumably "nothing parsed yet" -- confirm against the code that
    # drives the state machine.
    START = 1
    # Presumably "collecting the header row" -- confirm against the caller.
    HEADER = 2
    # Presumably "collecting data rows" -- confirm against the caller.
    ROW = 3
class MyHTMLParser(HTMLParser):
    """Collect the text of HTML table cells into a list of rows.

    Feed HTML with ``feed()``; afterwards ``table`` holds one list of cell
    strings per table row that contained at least one ``<th>``/``<td>``
    cell, in document order.  Header and data cells are treated alike; the
    caller decides which row is the header.

    Attributes:
        table: completed rows, each a list of cell strings.
        row:   cells of the row currently being read.
        th/td: True while inside an open ``<th>`` / ``<td>`` element.
        data:  text accumulated so far for the currently open cell.
    """

    def __init__(self):
        super().__init__()
        self.table = []
        self.row = []
        self.th = False
        self.td = False
        # Initialised here so a closing tag seen before any opening cell
        # tag cannot hit a missing attribute.
        self.data = ""

    def handle_starttag(self, tag, attrs):
        """Start a new row on ``<tr>``; start a new cell on ``<th>``/``<td>``."""
        if tag == "tr":
            # A new row begins: flush whatever the previous row collected.
            self.end_of_row()
        elif tag == "th":
            self.th = True
            self.data = ""
        elif tag == "td":
            self.td = True
            self.data = ""

    def handle_endtag(self, tag):
        """Close the current cell on ``</th>``/``</td>``; flush on ``</tr>``/``</table>``."""
        if tag in ("th", "td"):
            # Only record a cell that was actually opened; a stray closing
            # tag must not append a phantom cell holding stale text.
            if self.th or self.td:
                self.row.append(self.data)
            self.th = False
            self.td = False
        elif tag in ("tr", "table"):
            # Flushing on </tr> as well as </table> keeps the last row even
            # when the closing </table> is missing.  end_of_row() is a no-op
            # for an empty row, so flushing twice is harmless.
            self.end_of_row()

    def handle_data(self, data):
        """Append text to the open cell, turning newlines into spaces."""
        if self.th or self.td:
            # Text may arrive in several chunks (e.g. around inline tags
            # such as <b>), so accumulate instead of overwriting.
            self.data += data.replace("\n", " ")

    def end_of_row(self):
        """Move the current row into ``table`` if it has any cells."""
        if self.row:
            self.table.append(self.row)
            self.row = []


class Students:

Expand Down Expand Up @@ -165,38 +204,20 @@ def is_end_of_row(s):

# load from HTML file
def load_html_file(file, students):

state = State.START
try:
with open(file, 'r', encoding='utf-8') as infile:
for line in infile:

if state == State.START:
if is_tr(line):
row = ""
state = State.HEADER

elif state == State.HEADER:
if is_tr(line):
# header
m = re.findall(r'<th[^>]*>(.*?)</th>', row, re.IGNORECASE)
students.add_header(m)

row = ""
state = State.ROW
else:
# remove spaces at the end of the line, including newline
row += line.rstrip()

else: #State.ROW
if is_end_of_row(line):
m = re.findall(r'<td[^>]*>(.*?)</td>', row, re.IGNORECASE)
students.add(m)
logging.debug(m)

row = ""
else:
row += line.rstrip()
parser = MyHTMLParser()
with open(file, 'r', encoding='utf-8') as file:
lines = file.read()

parser.feed(lines)

first = True
for row in parser.table:
if first:
students.add_header(row)
first = False
else:
students.add(row)

except FileNotFoundError as e:
print(e)
Expand Down

0 comments on commit 6c0c854

Please sign in to comment.