') get_title = re.compile('<title>(.*)

#!/usr/bin/python3 # import os import sys import re import argparse import string ### page-to-template-data: # # This program converts a web page or blog post to a file consisting # of an YAML header followed by the contents of the body tag. # Server-side includes are removed. # # The contents of the title tag are put in the title: header, and each # becomes x: y # # The headers can be output to a separate file. # long_description = '''Convert web page or blog post to html fragment with metadata. This is a rather hacky python program that converts a web page (typically index.html) or an HTML fragment with an RF822 (email) header into a body-contents fragment with a YAML header. The main use is converting web pages and blog posts to something that can be the input to a templating engine (e.g. mustache) or site generator (e.g. Jekyll). RFC822-style headers are currently used in blog posts. The --metafile option lets you direct the metadata to a different file. The --jekyll option produces Jekyll frontmatter, preceeded and followed by ---. If the input is a blog post with an RFC822 header, a jekyll header is the default. This program cannot convert a YAML header to RFC822. ''' # Note that we make no attempt to actually parse the HTML argparser = argparse.ArgumentParser(description=long_description) argparser.add_argument('-j', '--jekyll', action='store_true', help='generate Jekyll metadata') argparser.add_argument('-o', '--outfile', help='output file name - default stdout') argparser.add_argument('-m', '--metafile', help='metadata file name - default outfile') argparser.add_argument('infile', help='input file') args = argparser.parse_args() input_file_name = args.infile output_file_name = args.outfile meta_file_name = args.metafile jekyll = args.jekyll ########################################################################## # Functions: # Extend line with additional lines from file until regex.search succeeds. 
# This is used to get a complete tag or element assuming that the start
# tag is contained in the line.
# Return (line, match) where line is whatever remains after the match.
#
# Newlines inside the accumulated text are turned into spaces before the
# next line is appended, so that patterns using "." can match a tag or
# element that is spread over several input lines.
# If the input ends before the regex matches, an error is reported on
# stderr and the program exits with status 1 (instead of looping forever).
def get_complete_tag(regex, line, file):
    while True:
        match = regex.search(line)
        if match:
            return (line[match.end():], match)
        more = file.readline()
        if not more:
            # readline returns an empty string at EOF, and would keep doing
            # so forever: bail out rather than hang.
            print("Unexpected end of file inside a tag", file=sys.stderr)
            sys.exit(1)
        line = line.replace('\n', ' ') + more


# remove backslashes from a string.
# We need this because our mail-like headers get quoted in shell scripts,
# and contain backslash-escaped single quotes.
# A false value (None or an empty string) yields an empty string.
def unescape_string(s):
    if not s:
        return ""
    return s.replace('\\', '')


# Extract metadata from the <head> element of the file.
# The metadata is returned as a dict.
# If the file starts out looking like a mail header, use extract_mail_metadata
#
# vfile is the open input file; it is read line by line up to and including
# the line holding </head> (and the rest of a <body ...> tag that starts on
# that same line).  If no </head> is found the program exits with status 1.
def extract_metadata(vfile):
    metadata = {}
    end_head = re.compile('</head>')
    title = re.compile('<title>')
    get_title = re.compile('<title>(.*)</title>')
    # NOTE(review): the two <meta> patterns were reconstructed from the
    # header comment ("<meta name=x content=y> becomes x: y"); confirm them
    # against the pages this script is run on.
    meta = re.compile(r'<meta\s+name=')
    get_meta = re.compile(
        r'<meta\s+name="([^"]*)"[^>]*?\scontent="([^"]*)"[^>]*>')

    line = vfile.readline()
    if re.search(r'^(\w)+:', line):
        # If it looks like we have a header, use extract_mail_metadata
        return extract_mail_metadata(line, vfile)

    while not end_head.search(line):
        if title.search(line):
            (line, match) = get_complete_tag(get_title, line, vfile)
            metadata['title'] = match.group(1)
            continue
        if meta.search(line):
            (line, match) = get_complete_tag(get_meta, line, vfile)
            metadata[match.group(1).lower()] = match.group(2)
            continue
        line = vfile.readline()
        if not line:
            # readline returns an empty string at EOF; without this check
            # we would hang if there's no </head>
            print("No </head> tag found", file=sys.stderr)
            sys.exit(1)

    # If the <body> tag starts on the same line as </head>, remove it.
    # (note that it might continue for several lines)
    if re.search(r'<body', line):
        get_complete_tag(re.compile(r'<body[^>]*>'), line, vfile)
    return metadata


# Extract metadata from an email (rfc822) header
# This gets called in case the file looks like a DW post
# In that case, we also want to set jekyll to true.
def extract_mail_metadata(line, file):
    """Collect the fields of a mail-style header into a dict.

    `line` is the first header line (already read by the caller) and `file`
    supplies the following lines.  Reading stops at the first blank line or
    at the first line that does not look like ``Name: value``.

    The ``Subject`` field is stored under the key ``title`` with its value
    unchanged; every other field is stored under its lowercased name with
    its value passed through unescape_string.

    Side effects: sets the module-level ``jekyll`` flag to True.  If the
    input ends before the header does, prints "No body found" to stderr and
    exits with status 1.
    """
    global jekyll
    jekyll = True

    header_re = re.compile(r'^(\w+):\s*([^\s].*)?$')
    blank_re = re.compile(r'^\s*$')
    metadata = {}

    while blank_re.search(line) is None:
        found = header_re.search(line)
        if found is None:
            break
        key, value = found.group(1), found.group(2)
        # Subject -> title.  All keys to lowercase.
        if key == 'Subject':
            metadata['title'] = value
        else:
            metadata[key.lower()] = unescape_string(value)
        # readline just returns an empty string at EOF, so the file is used
        # as an iterator and the StopIteration it raises when exhausted is
        # handled; otherwise we would hang if there's no body.
        try:
            line = next(file)
        except StopIteration:
            print("No body found", file=sys.stderr)
            exit(1)
    return metadata

# Extract the contents of the element, and return it as a list of lines. # The , , and tags and server-side includes are skipped. # We assume that the end tags and SSI comments are not continued onto other lines. # The tag is skipped in extract_metadata if it's on the same line as . # # TODO: This should be rewritten using a real parser, e.g. Beautiful Soup. # def extract_body_contents(file): body_text = [] start_cut = re.compile(r']*>') end_cut = re.compile(r'^(.*)') colophon= re.compile(r'^]*>'), line, file) continue if re.search(r'