# Post-patch code reconstructed from the commit "use python html parser"
# (6c0c854, Jerry Shi, 2023-12-24) against cleanup-roster.py.
#
# The commit replaces the line-based State machine / regex loader with an
# html.parser based table reader.  It also comments out the ampersand
# re.sub() in get_major_from_program(); that function is only partly visible
# in the patch and is therefore not reproduced here.
from html.parser import HTMLParser


class MyHTMLParser(HTMLParser):
    """Collect the text of every table cell, row by row.

    After feeding a document, ``table`` holds one list of cell strings per
    non-empty ``<tr>``; header (``<th>``) and data (``<td>``) cells are
    stored alike.  Newlines inside a cell are replaced by spaces.
    """

    def __init__(self):
        super().__init__()
        self.table = []     # completed rows
        self.row = []       # cells of the row currently being read
        self.th = False     # inside a <th> cell
        self.td = False     # inside a <td> cell
        self.data = ""      # text collected for the current cell

    def handle_starttag(self, tag, attrs):
        """Open a new row or cell; implicitly close whatever was open."""
        if tag == "tr":
            self.end_of_row()
        elif tag in ("th", "td"):
            # A new cell ends the previous one even if its end tag was omitted.
            self._end_of_cell()
            self.th = tag == "th"
            self.td = tag == "td"
            self.data = ""

    def handle_endtag(self, tag):
        """Commit the current cell or row when its end tag is seen."""
        if tag in ("th", "td"):
            self._end_of_cell()
        elif tag in ("tr", "table"):
            self.end_of_row()

    def handle_data(self, data):
        """Append text to the current cell.

        Text split by nested inline tags (``<td><b>A</b> B</td>``) arrives in
        several calls, so it is accumulated rather than overwritten.
        """
        if self.th or self.td:
            self.data += data.replace("\n", " ")

    def close(self):
        """Flush buffered input and commit a row left open at end of input."""
        super().close()
        self.end_of_row()

    def end_of_row(self):
        """Commit the current row to ``table``; empty rows are skipped."""
        self._end_of_cell()
        if self.row:
            self.table.append(self.row)
            self.row = []

    def _end_of_cell(self):
        """Commit the open cell, if any, to the current row."""
        if self.th or self.td:
            self.row.append(self.data)
            self.th = self.td = False
            self.data = ""


# load from HTML file
def load_html_file(file, students):
    """Read the table in an HTML file into *students*.

    The first table row is passed to ``students.add_header()``; every later
    row is passed to ``students.add()``.  A missing file is reported with
    ``print`` and otherwise ignored.

    Args:
        file: path of the UTF-8 encoded HTML file.
        students: object providing ``add_header(row)`` and ``add(row)``.
    """
    parser = MyHTMLParser()
    try:
        # Only the file access is guarded; parsing errors should surface.
        with open(file, 'r', encoding='utf-8') as infile:
            parser.feed(infile.read())
    except FileNotFoundError as e:
        print(e)
    else:
        parser.close()
        rows = iter(parser.table)
        header = next(rows, None)
        if header is not None:
            students.add_header(header)
            for row in rows:
                students.add(row)