| 1 | #!/usr/bin/env python3
|
| 2 | '''
|
| 3 | This tool reads in the headings on a rendered HTML doc page and produces a tree
|
| 4 | of all headings (and their anchors) which can be used as a search index.
|
| 5 |
|
| 6 | The output is a JSON array with the shape Node[] where the Node type is
|
| 7 | defined as:
|
| 8 |
|
| 9 | type Node = {
|
| 10 | symbol: string;
|
| 11 | children: Node[] | undefined;
|
| 12 | anchor: string;
|
| 13 | };
|
| 14 |
|
| 15 | Usage:
|
| 16 |
|
| 17 | doctools/search_index.py _release/VERSION/doc/ref/chap-builtin-func.html
|
| 18 | '''
|
| 19 |
|
| 20 | from html.parser import HTMLParser
|
| 21 | import argparse
|
| 22 | import json
|
| 23 | import os
|
| 24 |
|
| 25 |
|
class FindHeadings(HTMLParser):
    """Collect the <title> and the anchored headings of an HTML page.

    Feed the page source with feed(), then call GetSymbols() to get the
    headings arranged as a tree suitable for a search index.
    """

    _HEADING_TAGS = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')

    def __init__(self):
        super().__init__()

        # Heading tags that are currently open, innermost last.
        self.stack = []
        # Completed headings in document order; each is a dict with the keys
        # 'tag', 'id' ('#' + anchor) and 'title'.
        self.headings = []
        # Value of the most recently seen <a name="...">.
        # NOTE(review): this is never reset, so a heading that has no anchor
        # of its own is linked to the previous anchor -- confirm intended.
        self.anchor = None
        # The heading whose text is being collected, or None.
        self.heading = None

        # Text of the page <title>; stays None if no <title> tag is seen.
        self.title = None
        self.in_title = False

    def handle_starttag(self, tag, attrs):
        """Track <title>, heading tags and <a name="..."> anchors."""
        if tag == 'title':
            self.title = ''
            self.in_title = True

        if tag in self._HEADING_TAGS:
            self.stack.append({'tag': tag, 'id': None})
            self.heading = dict(self.stack[-1])

        # Each heading is accompanied by an <a name='anchor-name'></a>.
        # Collect these anchors as link targets.
        if tag == 'a' and len(attrs) == 1 and attrs[0][0] == 'name':
            # Only anchors whose sole attribute is `name` are recognised,
            # which is what the generated HTML is expected to contain.
            self.anchor = attrs[0][1]

    def handle_endtag(self, tag):
        """Finish the title or the pending heading when its tag closes."""
        if tag == 'title':
            self.in_title = False

        if self.stack and self.stack[-1]['tag'] == tag:
            self.stack.pop()

            # Record the heading only when its own tag closes. Finalizing on
            # any end tag would cut the title short at the first piece of
            # inline markup, e.g. <h2><code>cd</code> builtin</h2> -> "cd".
            if self.heading is not None and 'title' in self.heading:
                self.headings.append(self.heading)
            self.heading = None

    def handle_data(self, data):
        """Accumulate text for the <title> or for the pending heading."""
        if self.in_title:
            self.title += data
            return

        # Text only matters inside a heading, and only once an anchor has
        # been seen that the heading can link to.
        if not self.stack or not self.anchor:
            return

        payload = data.strip()
        if not payload:
            return

        heading = self.heading
        if heading is None:
            return

        if 'title' in heading:
            # Later text chunks of the same heading are joined with a space.
            heading['title'] = heading['title'] + ' ' + payload
        else:
            heading['title'] = payload
            heading['id'] = '#' + self.anchor

    def GetSymbols(self, relpath):
        """Return the collected headings as a tree of search-index nodes.

        Args:
            relpath: Path of the page; it is the root node's anchor and the
                prefix of every heading anchor.

        Returns:
            [] if no <title> was seen. Otherwise a one-element list holding
            the root node {'symbol': <title>, 'children': [...],
            'anchor': relpath}. Heading nodes have 'symbol' and 'anchor',
            plus 'children' only when they have sub-headings.
        """
        if self.title is None:
            return []

        root_title = self.title.strip()

        symbols = []
        stack = []  # (level, node) for the chain of currently open ancestors

        for heading in self.headings:
            level = int(heading['tag'][1])  # 'h3' -> 3
            node = {
                'symbol': heading['title'],
                'anchor': relpath + heading['id'],
            }

            # Drop ancestors that are at the same or a deeper level; what is
            # left on top (if anything) is this heading's parent.
            while stack and stack[-1][0] >= level:
                stack.pop()

            if stack:
                parent = stack[-1][1]
                parent.setdefault('children', []).append(node)
            else:
                symbols.append(node)

            stack.append((level, node))

        return [{'symbol': root_title, 'children': symbols, 'anchor': relpath}]
|
| 113 |
|
| 114 |
|
def main():
    """Index the headings of one HTML page and print them as JSON.

    Command line: [--base-dir DIR] HTML. The page path made relative to
    --base-dir (or to the current directory when the option is omitted) is
    used as the link prefix for every anchor in the output.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--base-dir', type=str, help='Base directory to reference links from'
    )
    parser.add_argument('html', help='HTML file to extract headings from')

    args = parser.parse_args()

    # Decode explicitly instead of relying on the locale's default encoding.
    # NOTE(review): assumes the rendered pages are UTF-8 -- confirm.
    with open(args.html, encoding='utf-8') as f:
        source = f.read()

    find_headings = FindHeadings()
    find_headings.feed(source)
    # Flush any text the parser is still buffering at end of input.
    find_headings.close()

    relpath = os.path.relpath(args.html, start=args.base_dir)
    symbols = find_headings.GetSymbols(relpath)

    print(json.dumps(symbols))


if __name__ == '__main__':
    main()
|