1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49 import glob
50 import os
51 import platform
52 import sys
53 import unicodedata
54
# File names excluded from scanning.
# NOTE(review): the r'.' entry can never equal a glob(r'*.txt') result, so it
# appears to be inert — confirm whether a real file name was intended here.
blacklist = (
    r'HilfeZurCreoleSyntax.txt',
    r'.')
58
59
60 import re
class ReCache:
    """Caching wrapper around the ``re`` module.

    Compiled patterns are memoized per instance, keyed by
    ``(pattern, flags)``, so the scanning loop does not pay the
    module-cache lookup / recompilation overhead on every line.
    """

    def __init__(self):
        # Per-instance cache. The previous implementation kept a mutable
        # dict as a *class* attribute, silently shared by all instances
        # (classic mutable-class-attribute pitfall), and used a nested
        # dict-of-dicts; a single tuple-keyed dict is simpler.
        self.cache = {}

    def compile(self, pattern, flags=0):
        """Return the compiled regex for (pattern, flags), caching it."""
        key = (pattern, flags)
        regex = self.cache.get(key)
        if regex is None:
            regex = self.cache[key] = re.compile(pattern, flags)
        return regex

    def finditer(self, pattern, text, flags=0):
        """Cached equivalent of ``re.finditer``."""
        return self.compile(pattern, flags).finditer(text)

    def match(self, pattern, text, flags=0):
        """Cached equivalent of ``re.match``."""
        return self.compile(pattern, flags).match(text)

    def search(self, pattern, text, flags=0):
        """Cached equivalent of ``re.search``."""
        return self.compile(pattern, flags).search(text)

    def sub(self, pattern, replacement, text, flags=0):
        """Cached equivalent of ``re.sub``."""
        return self.compile(pattern, flags).sub(replacement, text)
81
82
83 import urllib.parse
class AnomalyOutputter:
    """Formats and writes anomaly reports.

    Prints a header for each new file (with its derived wiki URL), a
    sub-header for each new line, and then one decorated excerpt per
    anomaly. Keeps running statistics that the summary code reads
    afterwards: ``pathCount``, ``lineCount``, ``anomalyCount`` and
    ``anomalyCounts``.
    """

    # NOTE(review): 'qoute' is a typo for 'quote' but is kept so any
    # existing references to the attribute continue to work.
    qoute = r'"'
    ellipsis = r'…'
    sol = r'|'  # marker when the excerpt reaches the start of the line
    eol = r'|'  # marker when the excerpt reaches the end of the line

    def __init__(self, outputStream, textEscaper, textDecorator):
        """outputStream: writable text stream; textEscaper: object with
        escape/escapeLimitRight/escapeLimitLeft; textDecorator: object
        with decorateText() and the color-code attributes used below."""
        self.o = outputStream
        self.e = textEscaper
        self.d = textDecorator
        # Display budget for the quoted excerpt; the caller adjusts this
        # to the terminal width.
        self.maxPartLength = 70
        # Minimum context shown after a completely displayed anomaly.
        self.minAfterLength = 20
        # Running statistics. These were previously mutable *class*
        # attributes, shared between all instances.
        self.pathCount = 0
        self.lineCount = 0
        self.anomalyCount = 0
        self.anomalyCounts = dict()
        self.lastPath = r''
        self.lastLineNr = -1  # -1: no line header printed yet

    def out(self, path, lineNr, startColumn, endColumn, line, anomaly):
        """Report one anomaly.

        path: file containing the anomaly; lineNr: 0-based line number
        (printed 1-based); startColumn/endColumn: 0-based codepoint span
        of the anomaly inside line; line: the full line text; anomaly:
        short description, also used as the statistics key.
        """
        o = self.o
        e = self.e
        d = self.d
        q = self.qoute
        if self.lastPath != path:
            self.lastPath = path
            # Reset the line tracker so the first anomaly of a new file
            # always gets its "Line N:" header. The old code never reset
            # this (and started it at 0), which suppressed the header and
            # the lineCount increment for anomalies in the first line of
            # the first file and whenever a new file's first anomaly line
            # matched the previous file's last one.
            self.lastLineNr = -1
            self.pathCount += 1
            ePath = d.decorateText(e.escape(path), d.textBCyan)
            # Derive the wiki page name from the dump file name.
            pageName = path.replace(r' - ', r'/')
            if pageName[-4:] == r'.txt':
                pageName = pageName[0:-4]
            url = r'https://larpwiki.de/' + urllib.parse.quote(pageName)
            eUrl = d.decorateText(url, d.textWhite)
            o.write('\n%s%s%s\n<%s>:\n' % (q, ePath, q, eUrl))
        if self.lastLineNr != lineNr:
            # The old code repeated this comparison in an identical nested
            # 'if'; one check is sufficient.
            self.lineCount += 1
            self.lastLineNr = lineNr
            eLineNr = d.decorateText(str(lineNr + 1), d.textBYellow)
            o.write(' Line %s:\n' % (eLineNr))
        self.anomalyCount += 1
        if anomaly not in self.anomalyCounts:
            self.anomalyCounts[anomaly] = 1
        else:
            self.anomalyCounts[anomaly] += 1
        eColumn = d.decorateText(str(startColumn + 1), d.textBYellow)

        # Remaining display budget for the excerpt.
        ml = self.maxPartLength

        # Escape the anomalous span itself, truncated to the budget.
        t = e.escapeLimitRight(line[startColumn:endColumn], ml)
        part = t[0]
        partCpLength = t[1]
        partComplete = ((endColumn - startColumn - partCpLength) == 0)
        ml = max(0, ml - len(part))

        # Context after the anomaly is only shown when the anomaly itself
        # was displayed completely.
        if partComplete:
            mal = min(len(line) - endColumn, ml, int(ml / 2), self.minAfterLength)
        else:
            mal = 0
        bLength = min(startColumn, ml - mal)
        t = e.escapeLimitLeft(line[:startColumn], bLength)
        before = t[0]
        beforeCpLength = t[1]
        ml = max(0, ml - len(before))

        if partComplete:
            t = e.escapeLimitRight(line[endColumn:], ml)
            after = t[0]
            afterCpLength = t[1]
        else:
            after = r''
            afterCpLength = 0

        # Ellipses replace the start/end markers where text was truncated.
        if startColumn - beforeCpLength > 0:
            sol = self.ellipsis
        else:
            sol = self.sol
        if (startColumn + partCpLength + afterCpLength) < len(line):
            eol = self.ellipsis
        else:
            eol = self.eol
        before = d.decorateText(before, d.textYellow)
        part = d.decorateText(part, d.textBYellow, d.textUnderline)
        after = d.decorateText(after, d.textYellow)
        o.write('  Column %s, anomaly %s%s%s:\n' % (eColumn, q, anomaly, q))
        o.write('   %s%s%s%s%s%s%s\n' % (sol, q, before, part, after, q, eol))
175
176
class AnsiTextDecorator:
    """Wraps text in ANSI SGR escape sequences for terminal coloring.

    The class attributes are the raw SGR parameter strings; pass one or
    more of them to decorateText().
    """

    textBlack = r'30'
    textRed = r'31'
    textGreen = r'32'
    textYellow = r'33'
    textBlue = r'34'
    textMagenta = r'35'
    textCyan = r'36'
    textGrey = r'37'
    textBGrey = r'30;1'
    textBRed = r'31;1'
    textBGreen = r'32;1'
    textBYellow = r'33;1'
    textBBlue = r'34;1'
    textBMagenta = r'35;1'
    textBCyan = r'36;1'
    textWhite = r'37;1'
    textBold = r'1'
    textItalic = r'3'
    textUnderline = r'4'
    backgroundBlack = r'40'
    backgroundRed = r'41'
    backgroundGreen = r'42'
    backgroundYellow = r'43'
    backgroundBlue = r'44'
    backgroundMagenta = r'45'
    backgroundCyan = r'46'
    backgroundGrey = r'47'

    def decorateText(self, text, *codes):
        """Return *text* prefixed with one escape sequence per code and
        suffixed with a single attribute reset; unchanged if no codes."""
        if not codes:
            return text
        prefix = r''.join('\x1B[' + code + r'm' for code in codes)
        return prefix + text + '\x1B[0m'
214
class dummyTextDecorator(AnsiTextDecorator):
    """No-op decorator for streams that should not receive ANSI codes
    (e.g. pipes or Windows consoles); exposes the same color attributes
    as AnsiTextDecorator but never emits escape sequences."""

    def decorateText(self, text, *codes):
        # All requested codes are deliberately ignored.
        return text
218
219
220 from io import StringIO
class TextEscaper:
    """Escapes text into a printable, repr-style single-line form."""

    def escape(self, text):
        """Return *text* with non-printables repr-escaped and '"' made
        into '\\"'; empty input yields the empty string."""
        if not len(text): return r''
        return repr(text)[1:-1].replace(r'"', r'\"')

    def escapeLimitRight(self, text, maxLength):
        """Escape codepoints left-to-right until the escaped output would
        exceed maxLength characters.

        Returns (escapedString, numberOfSourceCodepointsConsumed).
        """
        if maxLength <= 0:
            return (r'', 0)
        pieces = []
        used = 0
        taken = 0
        for cp in text:
            escaped = self.escape(cp)
            if used + len(escaped) > maxLength:
                break
            pieces.append(escaped)
            taken += 1
            used += len(escaped)
            if used == maxLength:
                break
        return (r''.join(pieces), taken)

    def escapeLimitLeft(self, text, maxLength):
        """Like escapeLimitRight, but consumes codepoints starting from
        the right end of *text* (for context shown before an anomaly).

        Returns (escapedString, numberOfSourceCodepointsConsumed).
        """
        if maxLength <= 0:
            return (r'', 0)
        pieces = []
        used = 0
        for cp in reversed(text):
            escaped = self.escape(cp)
            if used + len(escaped) > maxLength:
                break
            pieces.append(escaped)
            used += len(escaped)
            if used == maxLength:
                break
        pieces.reverse()
        return (r''.join(pieces), len(pieces))
264
# --- Global output wiring ---------------------------------------------------
# o: output stream; e: escaper; d: decorator (ANSI colors only on a TTY
# outside Windows); cols: terminal width used to budget excerpt lengths.
o = sys.stdout
e = TextEscaper()
if o.isatty() and (platform.system() != r'Windows'):
    d = AnsiTextDecorator()
    import subprocess
    # Ask the terminal for its width via tput.
    # NOTE(review): int() raises ValueError if tput is missing or prints
    # nothing — the cols <= 0 fallback below does not guard against that;
    # confirm tput is always available on the target systems.
    cols = int(subprocess.Popen(('tput', 'cols'),
        stdout=subprocess.PIPE).stdout.read())
    if cols <= 0:
        cols = 80
else:
    d = dummyTextDecorator()
    cols = 80
ao = AnomalyOutputter(o, e, d)
# Reserve room for indentation, quotes and the start/end markers.
ao.maxPartLength = cols - 11
rec = ReCache()
280
281
# ---------------------------------------------------------------------------
# Main scan: iterate over all wiki page dumps (*.txt) in the working
# directory and report leftovers from an old-wiki conversion plus
# suspicious Unicode codepoints.
# ---------------------------------------------------------------------------
o.write('Scanning files...\n')
fileCount = 0     # files actually scanned
blistedCount = 0  # files skipped via the blacklist
paths = glob.iglob(r"*.txt")
for path in paths:
    if not os.path.isfile(path):
        continue
    if path in blacklist:
        blistedCount += 1
        continue
    fileCount += 1
    # NOTE(review): opened with the platform default encoding and without
    # a context manager; presumably the dumps are UTF-8 — confirm.
    file = open(path, 'r')
    lineNr = -1              # 0-based; AnomalyOutputter prints it 1-based
    firstDirectiveLine = 1   # line index where a redirect would be legal
    validRedirectPresent = False
    for line in file:
        # Strip only the trailing newline; other whitespace is significant.
        line = rec.sub("\n$", r'', line)
        lineNr += 1
        # '##' + whitespace starts a comment; '#x' (single '#') a directive.
        commentLine = (rec.match(r'##+\s', line) != None)
        directiveLine = not commentLine and (rec.match(r'#[^#]', line) != None)

        # --- Unicode scan, codepoint by codepoint -------------------------
        # Combining marks are only acceptable directly after a letter (or
        # after further marks following a letter).
        markAllowed = False
        lineLength = len(line)  # NOTE(review): unused
        for lineIndex, cp in enumerate(line):
            anomaly = False
            unexpectedMark = False
            cpCat = unicodedata.category(cp)
            cpCatMain = cpCat[0]

            # Letters are fine and permit a following combining mark.
            if cpCatMain == r'L':
                markAllowed = True
                continue

            if cpCatMain != r'M':
                markAllowed = False

            # U+FFFD REPLACEMENT CHARACTER: always report.
            if cp == r'�':
                anomaly = True

            if not anomaly:
                # Numbers, punctuation, symbols, separators and a few
                # whitelisted codepoints (tab, soft hyphen, ZWJ, LRM) are
                # fine. NOTE(review): the trailing None entry can never
                # equal a codepoint — dead member.
                if cpCatMain in (r'N', r'P', r'S', r'Z') or cp in (
                        '\t',
                        '\xad',
                        '\u200d',
                        '\u200e',
                        None):
                    continue
                if cpCatMain in (r'M'):
                    if markAllowed:
                        continue
                    # A combining mark with nothing to combine with.
                    anomaly = True
                    unexpectedMark = True

            # Anything not skipped above is reported (this also catches
            # e.g. control characters, with anomaly still False).
            # NOTE(review): before/after are computed but never used.
            before = line[max(0, lineIndex - 30):lineIndex]
            after = line[lineIndex + 1:lineIndex + 31]
            cpName = unicodedata.name(cp, r'unnamed')
            if unexpectedMark:
                suffix = r' not preceded by a letter'
            else:
                suffix = r''
            ao.out(path, lineNr, lineIndex, lineIndex + 1, line,
                (r'Unicode %s (%s, category %s)' + suffix)
                % (e.escape(cp), cpName, cpCat))

        # --- Old-wiki list bullets ('*', '#*', '##*', ...) ----------------
        match = rec.match(r'(\*|#(\*|#(\*|#)))[*#]*', line)
        if match:
            # A bullet line is neither a directive nor a comment.
            directiveLine = False
            commentLine = False
            start = match.start()
            end = match.end()
            ao.out(path, lineNr, start, end, line, r'Old wiki list')

        # Comment lines are exempt from all remaining checks.
        if commentLine:
            continue

        # NOTE(review): unreachable — commentLine is always False here
        # because of the 'continue' directly above, so firstDirectiveLine
        # never advances past its initial value of 1.
        if (firstDirectiveLine == lineNr) and commentLine:
            firstDirectiveLine += 1

        # After a valid redirect only directives/empty lines may follow.
        if validRedirectPresent and not directiveLine:
            match = rec.match(r'\s*(\S.*?)\s*$', line)
            if match:
                start = match.start(1)
                end = match.end(1)
                ao.out(path, lineNr, start, end, line,
                    r'Non-empty non-comment line after valid redirect')
            continue

        # --- #REDIRECT directive validation -------------------------------
        match = rec.match(r'#REDIRECT(\s*)(?P<name>.*)', line)
        if match:
            # NOTE(review): firstDirectiveLine is always 1 here (see the
            # unreachable branch above), i.e. always truthy, so every
            # redirect takes this branch; presumably
            # 'firstDirectiveLine == lineNr' was intended — confirm.
            if firstDirectiveLine:
                name = match.group(r'name')
                if not name:
                    ao.out(path, lineNr, 0, len(line), line, r'Redirect without target')
                else:
                    validRedirectPresent = True
            else:
                ao.out(path, lineNr, 0, len(line), line, r'Redirect in non-first line')
            continue

        # Other directive lines are exempt from the remaining checks.
        if directiveLine:
            continue

        # --- Old-wiki indenting / definition lists (leading ':' / ';') ----
        # NOTE(review): the second alternative (the smiley exclusion) can
        # never win the alternation because '[:;]*' always matches, so
        # smilies like ':-)' are still flagged — confirm intent.
        match = rec.match(r'''^(?P<firstChar>[:;])((?P<extraChars>[:;]*)|($|
            [^-\(\{\[\|\)\}\]pPD] # Do not match smilies.
            ))''', line, re.VERBOSE)
        if match:
            linePartPos = match.start()  # NOTE(review): unused
            linePart = match.group()     # NOTE(review): unused
            firstChar = match.group(r'firstChar')
            extraCount = len(match.group(r'extraChars'))
            end = 1 + extraCount
            if firstChar == r':':
                ao.out(path, lineNr, 0, end, line, r'Old wiki indenting')
                continue
            if firstChar == r';':
                ao.out(path, lineNr, 0, end, line, r'Old wiki definition list')
                continue

        # --- Old-wiki inline tags: <b>, </i>, <nowiki>, ... ---------------
        matches = rec.finditer(r'''<(?P<close>[/]?)(?P<name>(
            b|i|nowiki|pre|toc|tt
            ))>''', line, re.IGNORECASE | re.VERBOSE)
        for match in matches:
            start = match.start()
            end = match.end()
            closing = match.group(r'close')
            tagName = match.group(r'name')
            if closing:
                tagType = r'close'
            else:
                tagType = r'open'
            ao.out(path, lineNr, start, end, line, r'Old wiki tag %s %s'
                % (tagName, tagType))

        # --- Linebreak tags: old-wiki <br> vs creole <<BR>> ---------------
        matches = rec.finditer(r'''
            (?P<open><[<`]*)
            (?P<name>br)
            (?P<close>[>`]*>)
            ''', line, re.IGNORECASE | re.VERBOSE)
        for match in matches:
            start = match.start()
            end = match.end()
            linePart = match.group()  # NOTE(review): unused
            tagOpen = match.group(r'open')
            tagName = match.group(r'name')
            tagClose = match.group(r'close')
            # Plain '<br>' is the old-wiki form.
            if (tagOpen == '<') and (tagClose == '>'):
                ao.out(path, lineNr, start, end, line, r'Old wiki linebreak')
                continue
            # '<<...>>' is only valid as exactly '<<BR>>' (uppercase).
            if (tagOpen == '<<') and (tagClose[0:2] == '>>') and (tagName != 'BR'):
                ao.out(path, lineNr, start, end, line, r'Invalid linebreak')
                continue

        # --- Headline syntax checks ---------------------------------------
        matches = rec.finditer(r'''^
            (?P<spaceBeforOpen>\s*) # Illegal.
            (?P<openTag>[=]+) # Headline open tag.
            (?P<spaceAfterOpen>\s*) # Required.
            (?P<nIndicator>[\#*]*)\s* # Numbering indicator from old wiki.
            (?P<text>.*?) # Required headline text (non-greedy).
            (?P<spaceBeforClose>\s*) # Required.
            (?P<closeTag>[=]*) # Has to be same as open tag.
            (?P<spaceAfterClose>\s*) # Illegal trailing whitespace.
            $''', line, re.VERBOSE)
        for match in matches:
            linePartPos = match.start()  # NOTE(review): unused
            linePart = match.group()     # NOTE(review): unused
            spaceBeforOpen = match.group(r'spaceBeforOpen')
            openTag = match.group(r'openTag')
            openTagStart = match.start(r'openTag')
            openTagEnd = match.end(r'openTag')
            spaceAfterOpen = match.group(r'spaceAfterOpen')
            nIndicator = match.group(r'nIndicator')
            text = match.group(r'text')
            spaceBeforClose = match.group(r'spaceBeforClose')
            closeTag = match.group(r'closeTag')
            spaceAfterClose = match.group(r'spaceAfterClose')
            if spaceBeforOpen:
                end = len(spaceBeforOpen)
                ao.out(path, lineNr, 0, end, line, r'Headline starts with whitespace')
            if len(openTag) > 5:
                start = openTagStart
                end = openTagEnd
                ao.out(path, lineNr, start, end, line, r'Headline of level > 5')
            if not text:
                # Without text none of the following checks make sense.
                end = len(line)
                start = openTagEnd - 1
                ao.out(path, lineNr, start, end, line, r'Headline contains no text')
                continue
            else:
                # Headline text must not contain bold/italic markup.
                iMatches = rec.finditer(r"[`']{2,}", text)
                for iMatch in iMatches:
                    start = match.start(r'text') + iMatch.start()
                    end = match.start(r'text') + iMatch.end()
                    ao.out(path, lineNr, start, end, line,
                        r'Headline text contains markup')
            if not spaceAfterOpen:
                if nIndicator:
                    start = match.start(r'nIndicator')
                else:
                    start = match.start(r'text')
                ao.out(path, lineNr, start, start + 1, line,
                    r'Headline without whitespace after open tag')
            if nIndicator:
                start = match.start(r'nIndicator')
                end = match.end(r'nIndicator')
                ao.out(path, lineNr, start, end, line,
                    r'Headline with old numbering indicator')
            if not closeTag:
                # Without a close tag the remaining checks make no sense.
                ao.out(path, lineNr, len(line) - 1, len(line), line,
                    r'Headline without close tag')
                continue
            if len(openTag) != len(closeTag):
                start = match.start(r'closeTag')
                end = match.end(r'closeTag')
                ao.out(path, lineNr, start, end, line,
                    r'Headline with different length open and close tags')
            if not spaceBeforClose:
                start = match.start(r'closeTag')
                ao.out(path, lineNr, start, start + 1, line,
                    r'Headline without whitespace before close tag')
            if spaceAfterClose:
                start = match.start(r'spaceAfterClose')
                end = match.end(r'spaceAfterClose')
                ao.out(path, lineNr, start, end, line, r'Headline ends with whitespace')

        # --- Fail-converted link syntax -----------------------------------
        matches = rec.finditer(r'''
            (?P<openBrackets>\[[\[`]*) # Link open brackets (2 for valid links).
            (?P<openQuote>"?) # Artifact from old wiki conversion.
            \s*
            (?P<linkUrl>.*?) # Link URL (not greedy).
            \s*
            (?P<closeQuote>"?) # Artifact from old wiki conversion.
            (?P<closeBrackets>[\]`]*\]) # Link open brackets (2 for valid links).
            ''', line, re.IGNORECASE | re.VERBOSE)
        for match in matches:
            start = match.start()
            end = match.end()
            linePart = match.group()                      # NOTE(review): unused
            openBrackets = match.group(r'openBrackets')
            openQuote = match.group(r'openQuote')
            linkUrl = match.group(r'linkUrl')
            closeQuote = match.group(r'closeQuote')       # NOTE(review): unused
            closeBrackets = match.group(r'closeBrackets') # NOTE(review): unused
            if openQuote:
                ao.out(path, lineNr, start, end, line,
                    r'Fail-converted unnamed internal link')
                continue
            # A single-bracket link containing ':' is an old external link.
            if (len(openBrackets) == 1) and rec.search(r':', linkUrl):
                ao.out(path, lineNr, start, end, line,
                    r'Fail-converted external link')
                continue

        # --- Old-wiki upload links: 'upload:<name>' -----------------------
        matches = rec.finditer(r'(^|\s)(?P<link>upload:\S+)(\s|$)', line, re.I)
        for match in matches:
            start = match.start(r'link')
            end = match.end(r'link')
            ao.out(path, lineNr, start, end, line,
                r'Old wiki upload link')

    file.close()
# --- Summary ---------------------------------------------------------------
# Report overall statistics gathered by the AnomalyOutputter, with the
# per-anomaly counters right-aligned to the widest count.
eFileCount = d.decorateText(str(fileCount), d.textBYellow)
eBlistedCount = d.decorateText(str(blistedCount), d.textBYellow)
if ao.anomalyCount:
    counts = ao.anomalyCounts
    eAnomalyCount = d.decorateText(str(ao.anomalyCount), d.textBYellow)
    eLineCount = d.decorateText(str(ao.lineCount), d.textBYellow)
    ePathCount = d.decorateText(str(ao.pathCount), d.textBYellow)
    o.write(('\nFound %s anomalies in %s lines from %s files'
        + ' (%s scanned, %s excluded):\n')
        % (eAnomalyCount, eLineCount, ePathCount, eFileCount, eBlistedCount))
    # Width of the largest counter determines the column width.
    widest = max(counts.values())
    countFormat = r'%' + repr(len(repr(widest))) + r'i'
    for anomalyKey in sorted(counts):
        eCount = d.decorateText(countFormat % (counts[anomalyKey]), d.textBYellow)
        o.write('  %s %s\n' % (eCount, anomalyKey))
else:
    o.write('\nFound no anomalies in %i files (%s excluded).\n'
        % (fileCount, eBlistedCount))