OILS / doctools / spelling.py View on Github | oils.pub

141 lines, 85 significant
1#!/usr/bin/env python2
2"""spelling.py.
3
4Filter the output of 'lynx -dump' into a list of words to spell check.
5"""
6from __future__ import print_function
7
8from collections import Counter
9import optparse
10import re
11import sys
12
13from doctools.util import log
14
15
def SplitWords(contents):
    """Yield each word found in a string, in order of appearance.

    URLs (http, https, file schemes) are deleted before tokenizing so that
    their path components are not reported as words.  A word is a run of ASCII
    letters, optionally followed by a "'t" contraction suffix, e.g. "doesn't"
    or "can't".
    """
    # Strip URLs up front; everything after the scheme up to the next
    # whitespace is discarded.
    without_urls = re.sub(r'(http|https|file)://\S+', '', contents)

    word_pat = re.compile(
        r'''
        [a-zA-Z]+
        (?:\'t\b)?    # optional contraction
        ''', re.VERBOSE)

    for match in word_pat.finditer(without_urls):
        yield match.group(0)
35
36
def WordList(f):
    """Yield every line of f with surrounding whitespace removed.

    f is any iterable of strings, typically an open file with one word per
    line.  Lines are not filtered: a blank line yields an empty string.
    """
    for raw_line in f:
        entry = raw_line.strip()
        yield entry
41
42
def Options():
    """Build and return the optparse parser for this tool's flags."""
    # (flag, keyword arguments) pairs, registered in this order.
    flag_specs = [
        ('--known-words', {
            'dest': 'known_words',
            'help': 'List of words like /usr/share/dict/words',
        }),
        ('--more-than-bash', {
            'dest': 'more_than_bash',
            'type': int,
            'default': 0,
            'help': 'Expected number of cases where OSH starts more '
                    'processes than bash',
        }),
    ]

    parser = optparse.OptionParser()
    for flag, kwargs in flag_specs:
        parser.add_option(flag, **kwargs)
    return parser
57
58
def main(argv):
    """Dispatch to the action named by the first positional argument.

    Args:
      argv: full argument vector; argv[0] is the program name and is skipped.

    Actions:
      word-split  Read text from stdin and print one word per line.
      check       Treat the remaining arguments as word files (one word per
                  line).  Print the most and least common words across all
                  files, then, per file, the words that are not in the
                  --known-words list and occur exactly once in the corpus.

    Raises:
      RuntimeError: if no action is given, the action is not recognized, or
        'check' is used without --known-words.
    """
    o = Options()
    opts, argv = o.parse_args(argv[1:])

    # Fail with a message the entry point can report, not an IndexError.
    if not argv:
        raise RuntimeError('Action required: word-split or check')
    action = argv[0]

    if action == 'word-split':
        contents = sys.stdin.read()
        for w in SplitWords(contents):
            print(w)

    elif action == 'check':
        # Validate before doing any work; open(None) would otherwise fail
        # with a TypeError after the frequency tables were already printed.
        if not opts.known_words:
            raise RuntimeError("Action 'check' requires --known-words")

        word_files = argv[1:]

        # Word -> number of occurrences across all word files.
        d = Counter()
        for path in word_files:
            with open(path) as f:
                d.update(WordList(f))

        # Sort once; both tables below are slices of the same ranking.
        ranked = d.most_common()

        print('')
        print('Most common words')
        print('')
        for word, count in ranked[:20]:
            print('%10d %s' % (count, word))

        print('')
        print('Least common words')
        print('')
        for word, count in ranked[-20:]:
            print('%10d %s' % (count, word))

        log('%d word files', len(word_files))
        log('%d unique words', len(d))

        with open(opts.known_words) as f:
            known_words = set(WordList(f))

        print('')
        print('Potential Misspellings')
        print('')

        for path in word_files:

            print('')
            print('\t%s' % path)
            print('')

            unknown = set()
            with open(path) as f:
                for w in WordList(f):
                    # Try the exact spelling first so capitalized entries in
                    # the known list (proper nouns, acronyms) can match, then
                    # fall back to lower case for sentence-initial words.
                    if w not in known_words and w.lower() not in known_words:
                        unknown.add(w)

            if unknown:
                for u in sorted(unknown):
                    # Only report words that occur once in the whole corpus.
                    if d.get(u) == 1:
                        print(u)
                log('\t%d unknown words in %s', len(unknown), path)

        # Checking algorithms:
        #
        # - Does it appear in the dictionary?  Problem: most computer terms
        # - Does it appear only once or twice in the whole corpus?
        # - Is the edit distance very close to a dictionary word?
        #   - e.g. substitutions is a typo

    else:
        raise RuntimeError('Invalid action %r' % action)
134
135
if __name__ == '__main__':
    # Report RuntimeError from main() as a one-line fatal message on stderr
    # and exit with status 1; any other exception propagates unchanged.
    try:
        main(sys.argv)
    except RuntimeError as err:
        print('FATAL: %s' % err, file=sys.stderr)
        sys.exit(1)