#!/usr/bin/python3
#
import os
import sys
import re
import argparse
import string

### page-to-template-data:
#
#   This program converts a web page or blog post to a file consisting
#   of a YAML header followed by the contents of the body tag.
#   Server-side includes are removed.
#
#   The contents of the title tag are put in the title: header, and each
#   metadata tag (presumably <meta name="x" content="y"> -- the tag text
#   was lost from this comment; confirm against extract_metadata)
#   becomes x: y
#
#   The headers can be output to a separate file.
#

# Text shown by --help.  argparse's default formatter collapses whitespace
# and re-wraps the description, so the line breaks below only matter for
# readability of this source file.
long_description = '''Convert web page or blog post to html fragment with metadata.

This is a rather hacky python program that converts a web page
(typically index.html) or an HTML fragment with an RFC822 (email)
header into a body-contents fragment with a YAML header.

The main use is converting web pages and blog posts to something that
can be the input to a templating engine (e.g. mustache) or site
generator (e.g. Jekyll). RFC822-style headers are currently used in
blog posts.

The --metafile option lets you direct the metadata to a different
file. The --jekyll option produces Jekyll frontmatter, preceded and
followed by ---. If the input is a blog post with an RFC822 header, a
jekyll header is the default.

This program cannot convert a YAML header to RFC822.
'''

# Note that we make no attempt to actually parse the HTML

# Command line: one required input file plus optional output/metadata
# destinations and a switch selecting Jekyll-style metadata.
argparser = argparse.ArgumentParser(description=long_description)
argparser.add_argument('-j', '--jekyll', action='store_true',
                       help='generate Jekyll metadata')
argparser.add_argument('-o', '--outfile',
                       help='output file name - default stdout')
argparser.add_argument('-m', '--metafile',
                       help='metadata file name - default outfile')
argparser.add_argument('infile', help='input file')

# Parsing happens at module level; the results are published as plain
# module globals.
args = argparser.parse_args()
input_file_name = args.infile
output_file_name = args.outfile
meta_file_name = args.metafile
jekyll = args.jekyll

##########################################################################
#   Functions:

# Extend line with additional lines from file until regex.search succeeds.
# This is used to get a complete tag or element assuming that the start
# tag is contained in the line.
# Return (line, match) where line is whatever remains after the match.
def get_complete_tag(regex, line, file):
    """Extend `line` from `file` until `regex` matches; return (rest, match).

    regex -- pattern object; only its search() method is used.
    line  -- the text read so far.
    file  -- object with a readline() method supplying further lines.

    Before each further line is appended, newlines in the text gathered
    so far are replaced by spaces, so an element that is split across
    several input lines is matched as if it were on one line.

    Returns a tuple (rest, match): `match` is the successful match object
    and `rest` is the text following the end of the match.

    Raises EOFError if the file runs out before the pattern matches.
    """
    while True:
        match = regex.search(line)
        if match:
            return (line[match.end():], match)
        next_line = file.readline()
        if not next_line:
            # readline() returns '' only at end of file.  Without this
            # check the loop would keep appending '' and never finish.
            raise EOFError('end of file reached before %r matched'
                           % (getattr(regex, 'pattern', regex),))
        # str.replace returns a new string, so the result must be kept;
        # the newly read line keeps its own newline until the next pass.
        line = line.replace('\n', ' ') + next_line


# remove backslashes from a string.
# We need this because our mail-like headers get quoted in shell scripts,
# and contain backslash-escaped single quotes.
def unescape_string(s):
    """Return `s` with every backslash removed; any false value gives ""."""
    if not s:
        return ""
    return s.replace('\\', '')


# Extract metadata from the <head>
element of the file. # The metadata is returned as a dict. # If the file starts out looking like a mail header, use extract_mail_metadata def extract_metadata(vfile): metadata = {} end_head = re.compile('') title = re.compile('