diff --git a/bin/lesson_check.py b/bin/lesson_check.py index f858ebd..c123984 100755 --- a/bin/lesson_check.py +++ b/bin/lesson_check.py @@ -8,7 +8,6 @@ import os import glob import json -import yaml import re from optparse import OptionParser @@ -38,12 +37,6 @@ '%/setup.md': True, } -# Required non-Markdown files. -NON_MARKDOWN_FILES = { - "AUTHORS", - "CITATION" -} - # Episode filename pattern. P_EPISODE_FILENAME = re.compile(r'/_episodes/(\d\d)-[-\w]+.md$') @@ -97,10 +90,9 @@ def main(): """Main driver.""" args = parse_args() - args.reporter = Reporter(args) - check_config(args) - check_non_markdown_files(args.source_dir, args.reporter) - docs = read_all_markdown(args, args.source_dir) + args.reporter = Reporter() + check_config(args.reporter, args.source_dir) + docs = read_all_markdown(args.source_dir, args.parser) check_fileset(args.source_dir, args.reporter, docs.keys()) for filename in docs.keys(): checker = create_checker(args, filename, docs[filename]) @@ -134,27 +126,15 @@ def parse_args(): return args -def check_config(args): +def check_config(reporter, source_dir): """Check configuration file.""" - config_file = os.path.join(args.source_dir, '_config.yml') - with open(config_file, 'r') as reader: - config = yaml.load(reader) - - args.reporter.check_field(config_file, 'configuration', config, 'kind', 'lesson') - - -def check_non_markdown_files(source_dir, reporter): - """Check presence of non-Markdown files.""" + config_file = os.path.join(source_dir, '_config.yml') + config = load_yaml(config_file) + reporter.check_field(config_file, 'configuration', config, 'kind', 'lesson') - for filename in NON_MARKDOWN_FILES: - path = os.path.join(source_dir, filename) - reporter.check(os.path.exists(path), - filename, - "File not found") - -def read_all_markdown(args, source_dir): +def read_all_markdown(source_dir, parser): """Read source files, returning {path : {'metadata':yaml, 'metadata_len':N, 'text':text, 'lines':[(i, line, len)], 'doc':doc}} """ @@ -164,7 +144,7 @@ def read_all_markdown(args, source_dir): result = {} for pat in all_patterns: for filename in glob.glob(pat): - data = read_markdown(args.parser, filename) + data = read_markdown(parser, filename) if data: result[filename] = data return result @@ -192,9 +172,9 @@ def check_fileset(source_dir, reporter, filenames_present): # Check for duplicate episode numbers. reporter.check(len(seen) == len(set(seen)), - None, - 'Duplicate episode numbers {0} vs {1}', - sorted(seen), sorted(set(seen))) + None, + 'Duplicate episode numbers {0} vs {1}', + sorted(seen), sorted(set(seen))) # Check that numbers are consecutive. seen = [int(s) for s in seen] @@ -239,6 +219,7 @@ def __init__(self, args, filename, metadata, metadata_len, text, lines, doc): self.text = text self.lines = lines self.doc = doc + self.layout = None @@ -371,7 +352,6 @@ class CheckEpisode(CheckBase): def __init__(self, args, filename, metadata, metadata_len, text, lines, doc): super(CheckEpisode, self).__init__(args, filename, metadata, metadata_len, text, lines, doc) - def check_metadata(self): super(CheckEpisode, self).check_metadata() if self.metadata: diff --git a/bin/util.py b/bin/util.py index b2d66dc..6af0a33 100644 --- a/bin/util.py +++ b/bin/util.py @@ -1,13 +1,17 @@ import sys import json -import yaml from subprocess import Popen, PIPE +try: + import yaml +except ImportError: + print('Unable to import YAML module: please install PyYAML', file=sys.stderr) + sys.exit(1) class Reporter(object): """Collect and report errors.""" - def __init__(self, args): + def __init__(self): """Constructor.""" super(Reporter, self).__init__() @@ -62,23 +66,13 @@ def read_markdown(parser, path): """ # Split and extract YAML (if present). - metadata = None - metadata_len = None with open(path, 'r') as reader: body = reader.read() - pieces = body.split('---', 2) - if len(pieces) == 3: - try: - metadata = yaml.load(pieces[1]) - except yaml.YAMLError as e: - print('Unable to parse YAML header in {0}:\n{1}'.format(path, e)) - sys.exit(1) - metadata_len = pieces[1].count('\n') - body = pieces[2] + metadata_raw, metadata_yaml, body = split_metadata(path, body) # Split into lines. - offset = 0 if metadata_len is None else metadata_len - lines = [(offset+i+1, l, len(l)) for (i, l) in enumerate(body.split('\n'))] + metadata_len = 0 if metadata_raw is None else metadata_raw.count('\n') + lines = [(metadata_len+i+1, line, len(line)) for (i, line) in enumerate(body.split('\n'))] # Parse Markdown. cmd = 'ruby {0}'.format(parser) @@ -87,9 +81,42 @@ def read_markdown(parser, path): doc = json.loads(stdout_data) return { - 'metadata': metadata, + 'metadata': metadata_yaml, 'metadata_len': metadata_len, 'text': body, 'lines': lines, 'doc': doc } + + +def split_metadata(path, text): + """ + Get raw (text) metadata, metadata as YAML, and rest of body. + If no metadata, return (None, None, body). + """ + + metadata_raw = None + metadata_yaml = None + metadata_len = None + + pieces = text.split('---', 2) + if len(pieces) == 3: + metadata_raw = pieces[1] + text = pieces[2] + try: + metadata_yaml = yaml.load(metadata_raw) + except yaml.YAMLError as e: + print('Unable to parse YAML header in {0}:\n{1}'.format(path, e), file=sys.stderr) + sys.exit(1) + + return metadata_raw, metadata_yaml, text + + +def load_yaml(filename): + """ + Wrapper around YAML loading so that 'import yaml' and error + handling is only needed in one place. + """ + + with open(filename, 'r') as reader: + return yaml.load(reader) diff --git a/bin/workshop_check.py b/bin/workshop_check.py index 4c863ca..d018b78 100755 --- a/bin/workshop_check.py +++ b/bin/workshop_check.py @@ -7,37 +7,21 @@ import sys import os import re -import logging -import yaml -from collections import Counter +from datetime import date +from util import Reporter, split_metadata -__version__ = '0.6' - -# basic logging configuration -logger = logging.getLogger(__name__) -verbosity = logging.INFO # severity of at least INFO will emerge -logger.setLevel(verbosity) - -# create console handler and set level to debug -console_handler = logging.StreamHandler() -console_handler.setLevel(verbosity) - -formatter = logging.Formatter('%(levelname)s: %(message)s') -console_handler.setFormatter(formatter) -logger.addHandler(console_handler) - - -# TODO: these regexp patterns need comments inside +# Metadata field patterns. EMAIL_PATTERN = r'[^@]+@[^@]+\.[^@]+' HUMANTIME_PATTERN = r'((0?[1-9]|1[0-2]):[0-5]\d(am|pm)(-|to)(0?[1-9]|1[0-2]):[0-5]\d(am|pm))|((0?\d|1\d|2[0-3]):[0-5]\d(-|to)(0?\d|1\d|2[0-3]):[0-5]\d)' EVENTBRITE_PATTERN = r'\d{9,10}' URL_PATTERN = r'https?://.+' +# Defaults. CARPENTRIES = ("dc", "swc") DEFAULT_CONTACT_EMAIL = 'admin@software-carpentry.org' -USAGE = 'Usage: "check-workshop path/to/index.html"\n' +USAGE = 'Usage: "check-workshop path/to/root/directory"' # Country and language codes. Note that codes mean different things: 'ar' # is 'Arabic' as a language but 'Argentina' as a country. @@ -86,18 +70,9 @@ ] -def add_error(msg, errors): - """Add error to the list of errors.""" - errors.append(msg) - - -def add_suberror(msg, errors): - """Add sub error, ie. error indented by 1 level ("\t"), to the list of errors.""" - errors.append("\t{0}".format(msg)) - - def look_for_fixme(func): - '''Decorator to fail test if text argument starts with "FIXME".''' + """Decorator to fail test if text argument starts with "FIXME".""" + def inner(arg): if (arg is not None) and \ isinstance(arg, str) and \ @@ -137,24 +112,26 @@ def check_language(language): @look_for_fixme def check_humandate(date): - '''"humandate" must be a human-readable date with a 3-letter month and - 4-digit year. Examples include "Feb 18-20, 2025" and "Feb 18 and - 20, 2025". It may be in languages other than English, but the + """ + 'humandate' must be a human-readable date with a 3-letter month + and 4-digit year. Examples include 'Feb 18-20, 2025' and 'Feb 18 + and 20, 2025'. It may be in languages other than English, but the month name should be kept short to aid formatting of the main - Software Carpentry web site.''' + Software Carpentry web site. + """ - if "," not in date: + if ',' not in date: return False - month_dates, year = date.split(",") + month_dates, year = date.split(',') # The first three characters of month_dates are not empty month = month_dates[:3] - if any(char == " " for char in month): + if any(char == ' ' for char in month): return False # But the fourth character is empty ("February" is illegal) - if month_dates[3] != " ": + if month_dates[3] != ' ': return False # year contains *only* numbers @@ -168,65 +145,80 @@ def check_humandate(date): @look_for_fixme def check_humantime(time): - '''"humantime" is a human-readable start and end time for the workshop, - such as "09:00 - 16:00".''' + """ + 'humantime' is a human-readable start and end time for the + workshop, such as '09:00 - 16:00'. + """ - return bool(re.match(HUMANTIME_PATTERN, time.replace(" ", ""))) + return bool(re.match(HUMANTIME_PATTERN, time.replace(' ', ''))) def check_date(this_date): - '''"startdate" and "enddate" are machine-readable start and end dates for - the workshop, and must be in YYYY-MM-DD format, e.g., "2015-07-01".''' + """ + 'startdate' and 'enddate' are machine-readable start and end dates + for the workshop, and must be in YYYY-MM-DD format, e.g., + '2015-07-01'. + """ - from datetime import date - # yaml automatically loads valid dates as datetime.date + # YAML automatically loads valid dates as datetime.date. return isinstance(this_date, date) @look_for_fixme def check_latitude_longitude(latlng): - '''"latlng" must be a valid latitude and longitude represented as two - floating-point numbers separated by a comma.''' + """ + 'latlng' must be a valid latitude and longitude represented as two + floating-point numbers separated by a comma. + """ try: lat, lng = latlng.split(',') lat = float(lat) long = float(lng) + return (-90.0 <= lat <= 90.0) and (-180.0 <= long <= 180.0) except ValueError: return False - return (-90.0 <= lat <= 90.0) and (-180.0 <= long <= 180.0) def check_instructors(instructors): - '''"instructor" must be a non-empty comma-separated list of quoted names, - e.g. ['First name', 'Second name', ...']. Do not use "TBD" or other - placeholders.''' + """ + 'instructor' must be a non-empty comma-separated list of quoted + names, e.g. ['First name', 'Second name', ...']. Do not use 'TBD' + or other placeholders. + """ - # yaml automatically loads list-like strings as lists + # YAML automatically loads list-like strings as lists. return isinstance(instructors, list) and len(instructors) > 0 def check_helpers(helpers): - '''"helper" must be a comma-separated list of quoted names, - e.g. ['First name', 'Second name', ...']. The list may be empty. Do - not use "TBD" or other placeholders.''' + """ + 'helper' must be a comma-separated list of quoted names, + e.g. ['First name', 'Second name', ...']. The list may be empty. + Do not use 'TBD' or other placeholders. + """ - # yaml automatically loads list-like strings as lists + # YAML automatically loads list-like strings as lists. return isinstance(helpers, list) and len(helpers) >= 0 @look_for_fixme def check_email(email): - '''"contact" must be a valid email address consisting of characters, a - @, and more characters. It should not be the default contact - email address "admin@software-carpentry.org".''' + """ + 'contact' must be a valid email address consisting of characters, + an '@', and more characters. It should not be the default contact + email address 'admin@software-carpentry.org'. + """ return bool(re.match(EMAIL_PATTERN, email)) and \ (email != DEFAULT_CONTACT_EMAIL) def check_eventbrite(eventbrite): - '''"eventbrite" (the Eventbrite registration key) must be 9 or more digits.''' + """ + 'eventbrite' (the Eventbrite registration key) must be 9 or more + digits. It may appear as an integer or as a string. + """ if isinstance(eventbrite, int): return True @@ -236,15 +228,19 @@ def check_eventbrite(eventbrite): @look_for_fixme def check_etherpad(etherpad): - '''"etherpad" must be a valid URL.''' + """ + 'etherpad' must be a valid URL. + """ return bool(re.match(URL_PATTERN, etherpad)) @look_for_fixme def check_pass(value): - '''This test always passes (it is used for "checking" things like - addresses, for which no sensible validation is feasible).''' + """ + This test always passes (it is used for 'checking' things like the + workshop address, for which no sensible validation is feasible). + """ return True @@ -265,38 +261,38 @@ def check_pass(value): 'humandate': (True, check_humandate, 'humandate invalid. Please use three-letter months like ' + - '"Jan" and four-letter years like "2025".'), + '"Jan" and four-letter years like "2025"'), 'humantime': (True, check_humantime, 'humantime doesn\'t include numbers'), 'startdate': (True, check_date, 'startdate invalid. Must be of format year-month-day, ' + - 'i.e., 2014-01-31.'), + 'i.e., 2014-01-31'), 'enddate': (False, check_date, 'enddate invalid. Must be of format year-month-day, i.e.,' + - ' 2014-01-31.'), + ' 2014-01-31'), 'latlng': (True, check_latitude_longitude, 'latlng invalid. Check that it is two floating point ' + - 'numbers, separated by a comma.'), + 'numbers, separated by a comma'), 'instructor': (True, check_instructors, 'instructor list isn\'t a valid list of format ' + - '["First instructor", "Second instructor",..].'), + '["First instructor", "Second instructor",..]'), 'helper': (True, check_helpers, 'helper list isn\'t a valid list of format ' + - '["First helper", "Second helper",..].'), + '["First helper", "Second helper",..]'), 'contact': (True, check_email, 'contact email invalid or still set to ' + '"{0}".'.format(DEFAULT_CONTACT_EMAIL)), - 'eventbrite': (False, check_eventbrite, 'Eventbrite key appears invalid.'), + 'eventbrite': (False, check_eventbrite, 'Eventbrite key appears invalid'), - 'etherpad': (False, check_etherpad, 'Etherpad URL appears invalid.'), + 'etherpad': (False, check_etherpad, 'Etherpad URL appears invalid'), 'venue': (False, check_pass, 'venue name not specified'), @@ -310,102 +306,85 @@ def check_pass(value): OPTIONAL = set([k for k in HANDLERS if not HANDLERS[k][0]]) -def check_validity(data, function, errors, error_msg): - '''Wrapper-function around the various check-functions.''' - valid = function(data) - if not valid: - add_error(error_msg, errors) - add_suberror('Offending entry is: "{0}"'.format(data), errors) - return valid +def check_blank_lines(reporter, raw): + """ + Blank lines are not allowed in category headers. + """ + lines = [(i, x) for (i, x) in enumerate(raw.strip().split('\n')) if not x.strip()] + reporter.check(not lines, + None, + 'Blank line(s) in header: {0}', + ', '.join(["{0}: {1}".format(i, x.rstrip()) for (i, x) in lines])) -def check_blank_lines(raw_data, errors, error_msg): - '''Blank lines are not allowed in category headers.''' - lines = [x.strip() for x in raw_data.split('\n')] - if '' in lines: - add_error(error_msg, errors) - add_suberror('{0} blank lines found in header'.format(lines.count('')), errors) - return False - return True +def check_categories(reporter, left, right, msg): + """ + Report differences (if any) between two sets of categories. + """ -def check_categories(left, right, errors, error_msg): - '''Report set difference of categories.''' - result = left - right - if result: - add_error(error_msg, errors) - add_suberror('Offending entries: {0}'.format(result), errors) - return False - return True - - -def get_header(text): - '''Extract YAML header from raw data, returning (None, None) if no - valid header found and (raw, parsed) if header found.''' - - # YAML header must be right at the start of the file. - if not text.startswith('---'): - return None, None - - # YAML header must start and end with '---' - pieces = text.split('---') - if len(pieces) < 3: - return None, None + diff = left - right + reporter.check(len(diff) == 0, + None, + '{0}: offending entries {1}', + msg, sorted(list(diff))) - # Return raw text and YAML-ized form. - raw = pieces[1].strip() - return raw, yaml.load(raw) +def check_file(reporter, path, data): + """ + Get header from file, call all other functions, and check file for + validity. + """ -def check_file(filename, data, errors): - '''Get header from file, call all other functions and check file - for validity. Return list of errors (empty when no errors).''' - - raw, header = get_header(data) - if header is None: - msg = ('Cannot find YAML header in given file "{0}".'.format(filename)) - add_error(msg, errors) - return errors + # Get metadata as text and as YAML. + raw, header, body = split_metadata(path, data) # Do we have any blank lines in the header? - is_valid = check_blank_lines(raw, errors, - 'There are blank lines in the header') + check_blank_lines(reporter, raw) # Look through all header entries. If the category is in the input # file and is either required or we have actual data (as opposed to # a commented-out entry), we check it. If it *isn't* in the header # but is required, report an error. for category in HANDLERS: - required, handler_function, error_message = HANDLERS[category] + required, handler, message = HANDLERS[category] if category in header: if required or header[category]: - is_valid &= check_validity(header[category], - handler_function, errors, - error_message) + reporter.check(handler(header[category]), + None, + '{0}\n actual value "{1}"', + message, header[category]) elif required: - msg = 'index file is missing mandatory key "{0}"'.format(category) - add_error(msg, errors) - is_valid = False + reporter.add(None, + 'Missing mandatory key "{0}"', + category) # Check whether we have missing or too many categories seen_categories = set(header.keys()) + check_categories(reporter, REQUIRED, seen_categories, + 'Missing categories') + check_categories(reporter, seen_categories, REQUIRED.union(OPTIONAL), + 'Superfluous categories') - is_valid &= check_categories(REQUIRED, seen_categories, errors, - 'There are missing categories') - - is_valid &= check_categories(seen_categories, REQUIRED.union(OPTIONAL), - errors, 'There are superfluous categories') +def check_config(reporter, filename): + """ + Check YAML configuration file. + """ -def check_config(filename, errors): - '''Check YAML configuration file.''' + config = load_yaml(filename) - with open(filename, 'r') as reader: - config = yaml.load(reader) + kind = config.get('kind', None) + reporter.check(kind == 'workshop', + filename, + 'Missing or unknown kind of event: {0}', + kind) - if config['kind'] != 'workshop': - msg = 'Not configured as a workshop: found "{0}" instead'.format(config['kind']) - add_error(msg, errors) + carpentry = config.get('carpentry', None) + reporter.check(carpentry in ('swc', 'dc'), + filename, + 'Missing or unknown carpentry: {0}', + carpentry) def main(): @@ -418,21 +397,13 @@ def main(): root_dir = sys.argv[1] index_file = os.path.join(root_dir, 'index.html') config_file = os.path.join(root_dir, '_config.yml') - logger.info('Testing "{0}" and "{1}"'.format(index_file, config_file)) - errors = [] - check_config(config_file, errors) + reporter = Reporter() + check_config(reporter, config_file) with open(index_file) as reader: data = reader.read() - check_file(index_file, data, errors) - - if errors: - for m in errors: - logger.error(m) - sys.exit(1) - else: - logger.info('Everything seems to be in order') - sys.exit(0) + check_file(reporter, index_file, data) + reporter.report() if __name__ == '__main__':