1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48 import re
49 import glob
50 import os
51 import platform
52 import sys
53 import unicodedata
54 import urllib.parse
55 from io import StringIO
56 import codecs
57
# Folder that holds this script; the other paths are derived from it.
baseDir = os.path.dirname(__file__)

# Folder below the script folder that holds the page files.
sourceDir = os.path.join(baseDir, 'backup')

# File names that are meant to be left out.
blacklist = ('HilfeZurCreoleSyntax.txt',)
64
65
class ReCache:
    """Thin front end for the `re` module that remembers compiled patterns.

    Compiled patterns are kept in a two-level mapping
    (pattern -> flags -> compiled object) stored on the class, so every
    instance uses the same cache.
    """

    cache = dict()

    def compile(self, pattern, flags=0):
        """Return the compiled form of pattern/flags, compiling on first use."""
        perFlags = self.cache.setdefault(pattern, dict())
        try:
            return perFlags[flags]
        except KeyError:
            compiled = perFlags[flags] = re.compile(pattern, flags)
            return compiled

    def finditer(self, pattern, text, flags=0):
        """Cached counterpart of re.finditer()."""
        regex = self.compile(pattern, flags)
        return regex.finditer(text)

    def match(self, pattern, text, flags=0):
        """Cached counterpart of re.match()."""
        regex = self.compile(pattern, flags)
        return regex.match(text)

    def search(self, pattern, text, flags=0):
        """Cached counterpart of re.search()."""
        regex = self.compile(pattern, flags)
        return regex.search(text)

    def sub(self, pattern, replacement, text, flags=0):
        """Cached counterpart of re.sub()."""
        regex = self.compile(pattern, flags)
        return regex.sub(replacement, text)
89
90
class AnomalyOutputter:
    """Write anomaly reports to a stream and keep running totals.

    Reports are grouped: a path header is written whenever the path
    differs from the previous report, a line header whenever the line
    differs, and then one entry per anomaly showing the offending part of
    the line with as much surrounding context as fits in maxPartLength.

    Counters (pathCount, lineCount, anomalyCount, anomalyCounts) can be
    read by the caller after reporting.
    """

    qoute = '"'
    ellipsis = '…'
    sol = '|'
    eol = '|'
    maxPartLength = 70
    minAfterLength = 20

    pathCount = 0
    lineCount = 0
    anomalyCount = 0
    lastPath = r''
    # None means "no line reported yet for the current path". A numeric
    # start value would swallow the header of a line with that number.
    lastLineNr = None

    def __init__(self, outputStream, textEscaper, textDecorator):
        """Store the collaborators used for writing, escaping and coloring.

        outputStream needs write(); textEscaper needs escape(),
        escapeLimitLeft() and escapeLimitRight(); textDecorator needs
        decorateText() and the text* code attributes used below.
        """
        self.o = outputStream
        self.e = textEscaper
        self.d = textDecorator
        # Per-instance tally (anomaly message -> count); a class-level dict
        # would be shared between all outputters.
        self.anomalyCounts = dict()

    def out(self, path, lineNr, startColumn, endColumn, line, anomaly):
        """Report one anomaly.

        lineNr, startColumn and endColumn are zero-based (they are printed
        one-based); line[startColumn:endColumn] is the offending part and
        anomaly is the message that is printed and tallied.
        """
        o = self.o
        e = self.e
        d = self.d
        q = self.qoute

        if self.lastPath != path:
            self.lastPath = path
            # Line numbers of different files are unrelated, so the first
            # anomaly of a new file always gets its line header.
            self.lastLineNr = None
            self.pathCount += 1
            ePath = d.decorateText(e.escape(path), d.textBCyan)
            pageName = os.path.basename(path).replace(r' - ', '/')
            if pageName[-4:] == '.txt':
                pageName = pageName[0:-4]
            url = 'https://larpwiki.de/' + urllib.parse.quote(pageName)
            eUrl = d.decorateText(url, d.textWhite)
            o.write('\n{0}:\n {1}\n'.format(ePath, eUrl))

        if self.lastLineNr != lineNr:
            self.lineCount += 1
            self.lastLineNr = lineNr
            eLineNr = d.decorateText(str(lineNr + 1), d.textBYellow)
            o.write(' Line {0}:\n'.format(eLineNr))

        self.anomalyCount += 1
        self.anomalyCounts[anomaly] = self.anomalyCounts.get(anomaly, 0) + 1
        eColumn = d.decorateText(str(startColumn + 1), d.textBYellow)

        # Remaining budget of output characters for this excerpt.
        ml = self.maxPartLength

        # The offending part comes first; it may get cut off on the right.
        part, partCpLength = e.escapeLimitRight(line[startColumn:endColumn], ml)
        partComplete = ((endColumn - startColumn - partCpLength) == 0)
        ml = max(0, ml - len(part))

        # Reserve some room for text behind the part (only shown if the
        # part itself fit), then fill up with text in front of it.
        if partComplete:
            mal = min(len(line) - endColumn, int(ml / 2), self.minAfterLength)
        else:
            mal = 0
        bLength = min(startColumn, ml - mal)
        before, beforeCpLength = e.escapeLimitLeft(line[:startColumn], bLength)
        ml = max(0, ml - len(before))

        # Whatever budget is left goes to the text behind the part.
        if partComplete:
            after, afterCpLength = e.escapeLimitRight(line[endColumn:], ml)
        else:
            after = r''
            afterCpLength = 0

        # An ellipsis marks a side on which the line was cut off.
        if startColumn - beforeCpLength > 0:
            sol = self.ellipsis
        else:
            sol = self.sol
        if (startColumn + partCpLength + afterCpLength) < len(line):
            eol = self.ellipsis
        else:
            eol = self.eol

        before = d.decorateText(before, d.textYellow)
        part = d.decorateText(part, d.textBYellow, d.textUnderline)
        after = d.decorateText(after, d.textYellow)
        msg = ' Column {1}, anomaly {0}{2}{0}:\n'
        o.write(msg.format(q, eColumn, anomaly))
        msg = ' {1}{0}{2}{3}{4}{0}{5}\n'
        o.write(msg.format(q, sol, before, part, after, eol))
184
185
class AnsiTextDecorator:
    """Decorate text with ANSI escape sequences.

    The class attributes are the numeric codes that decorateText() puts
    between '\\x1B[' and 'm'.
    """

    # Text colors.
    textBlack = '30'
    textRed = '31'
    textGreen = '32'
    textYellow = '33'
    textBlue = '34'
    textMagenta = '35'
    textCyan = '36'
    textGrey = '37'

    # Text colors combined with the bold code.
    textBGrey = '30;1'
    textBRed = '31;1'
    textBGreen = '32;1'
    textBYellow = '33;1'
    textBBlue = '34;1'
    textBMagenta = '35;1'
    textBCyan = '36;1'
    textWhite = '37;1'

    # Text attributes.
    textBold = '1'
    textItalic = '3'
    textUnderline = '4'

    # Background colors.
    backgroundBlack = '40'
    backgroundRed = '41'
    backgroundGreen = '42'
    backgroundYellow = '43'
    backgroundBlue = '44'
    backgroundMagenta = '45'
    backgroundCyan = '46'
    backgroundGrey = '47'

    def decorateText(self, text, *codes):
        """Return text framed by one escape sequence per code and a reset.

        Without codes the text is returned unchanged.
        """
        if len(codes) == 0:
            return text
        prefix = ''.join('\x1B[' + code + 'm' for code in codes)
        return prefix + text + '\x1B[0m'
223
class dummyTextDecorator(AnsiTextDecorator):
    """Decorator variant that leaves the text untouched.

    It inherits the code attributes, so it can stand in wherever an
    AnsiTextDecorator is expected but no escape sequences are wanted.
    """

    def decorateText(self, text, *codes):
        """Return text as is; the codes are ignored."""
        return text
228
229
class TextEscaper:
    """Turn text into an escaped, printable form, optionally length-limited."""

    def escape(self, text):
        """Return repr(text) without the outer quotes and with '"' escaped."""
        if len(text) == 0:
            return ''
        unquoted = repr(text)[1:-1]
        return unquoted.replace('"', '\\"')

    def escapeLimitRight(self, text, maxLength):
        """Escape characters from the start of text while they fit.

        Returns a tuple of the escaped string (at most maxLength characters
        long) and the number of source characters it represents. A
        character whose escaped form does not fit completely is left out.
        """
        if maxLength <= 0:
            return '', 0
        pieces = []
        used = 0
        for char in text:
            escaped = self.escape(char)
            if used + len(escaped) > maxLength:
                break
            pieces.append(escaped)
            used += len(escaped)
            if used == maxLength:
                break
        return ''.join(pieces), len(pieces)

    def escapeLimitLeft(self, text, maxLength):
        """Escape characters from the end of text while they fit.

        Mirror image of escapeLimitRight(): the result covers the tail of
        text and is returned as (escaped string, source character count).
        """
        if maxLength <= 0:
            return '', 0
        pieces = []
        used = 0
        for char in reversed(text):
            escaped = self.escape(char)
            if used + len(escaped) > maxLength:
                break
            pieces.append(escaped)
            used += len(escaped)
            if used == maxLength:
                break
        # Collected back to front, so restore reading order.
        pieces.reverse()
        return ''.join(pieces), len(pieces)
274
def _splitLines(text):
    r"""Split text at '\n', dropping a '\r' directly in front of a line break.

    The text behind the last '\n' is always returned as the final line (an
    empty string if there is none); a trailing '\r' on it is kept.
    """
    parts = text.split('\n')
    lines = [part[:-1] if part.endswith('\r') else part for part in parts[:-1]]
    lines.append(parts[-1])
    return lines


def main():
    """Scan every *.txt file in sourceDir and report anomalies on stdout.

    Each file is decoded as UTF-8 and checked line by line for unexpected
    Unicode characters and for a set of markup patterns (lists, redirects,
    indenting, tags, line breaks, headlines, links, upload links). A
    summary with per-anomaly counts is written at the end. Ctrl+C stops
    the scan but still prints the summary.
    """
    o = sys.stdout
    e = TextEscaper()
    if o.isatty() and (platform.system() != 'Windows'):
        d = AnsiTextDecorator()
        # shutil asks the OS directly and falls back to 80 columns, so no
        # external `tput` process is needed (and none can be missing).
        import shutil
        cols = shutil.get_terminal_size(fallback=(80, 24)).columns
        if cols <= 0:
            cols = 80
    else:
        d = dummyTextDecorator()
        cols = 80
    ao = AnomalyOutputter(o, e, d)
    ao.maxPartLength = cols - 11
    rec = ReCache()
    fileCount = 0
    blistedCount = 0

    # Non-letter, non-printing characters that are accepted anyway:
    # tab, soft hyphen, zero width joiner, left-to-right mark.
    allowedSpecials = ('\t', '\xad', '\u200d', '\u200e')

    try:
        o.write('Scanning files...\n')
        paths = glob.iglob(os.path.join(sourceDir, "*.txt"))
        for path in paths:
            if not os.path.isfile(path):
                continue
            # The blacklist holds bare file names while glob yields full
            # paths, so compare the base name (full paths still work).
            if (path in blacklist) or (os.path.basename(path) in blacklist):
                blistedCount += 1
                continue
            fileCount += 1

            # Read and decode the file:
            with open(path, 'rb') as file:
                textBytes = file.read()
            try:
                text = textBytes.decode('utf-8')
            except UnicodeDecodeError as error:
                # Everything in front of error.start is valid UTF-8; it
                # tells us the line and column of the offending byte. This
                # also catches a sequence that is cut off at end of file.
                validLines = _splitLines(textBytes[:error.start].decode('utf-8'))
                lineStr = validLines[-1]
                cpIndex = len(lineStr)
                msg = r'UTF-8 invalid byte while decoding line!'
                # ao.out() expects a zero-based line number.
                ao.out(path, len(validLines) - 1, cpIndex, cpIndex + 1,
                       lineStr, msg)
                continue
            lines = _splitLines(text)

            firstDirectiveLine = 1
            validRedirectPresent = False
            for lineNr, line in enumerate(lines):
                commentLine = line.startswith('##')
                directiveLine = not commentLine and line.startswith('#')

                # Check every character of the line:
                markAllowed = False
                for cpIndex, cp in enumerate(line):
                    anomaly = True
                    unexpectedMark = False
                    cpCat = unicodedata.category(cp)
                    cpCatMain = cpCat[0]

                    # Letters, numbers, punctuation, symbols, separators
                    # and a few special characters are fine.
                    if cpCatMain in 'LNPSZ' or cp in allowedSpecials:
                        anomaly = False

                    # U+FFFD REPLACEMENT CHARACTER is always reported.
                    if cp == '\ufffd':
                        anomaly = True

                    # Marks are accepted only after a letter (or after
                    # another accepted mark).
                    if cpCatMain == 'M':
                        if markAllowed:
                            anomaly = False
                        else:
                            anomaly, unexpectedMark = True, True
                    elif cpCatMain == 'L':
                        markAllowed = True
                    else:
                        markAllowed = False

                    if anomaly:
                        cpName = unicodedata.name(cp, r'unnamed')
                        if unexpectedMark:
                            suffix = ' not preceded by a letter'
                        else:
                            suffix = ''
                        msg = r'Unicode {0} ({1}, category {2}){3}'
                        msg = msg.format(e.escape(cp), cpName, cpCat, suffix)
                        ao.out(path, lineNr, cpIndex, cpIndex + 1, line, msg)

                # Old-style lists; such a line is neither treated as a
                # comment nor as a directive afterwards.
                match = rec.match(r'(\*|#(\*|#(\*|#)))[*#]*', line)
                if match:
                    directiveLine = False
                    commentLine = False
                    start = match.start()
                    end = match.end()
                    ao.out(path, lineNr, start, end, line, 'Old wiki list')

                # No further checks for comment lines:
                if commentLine:
                    continue

                # NOTE(review): commentLine is always False here because of
                # the `continue` above, so this never runs and
                # firstDirectiveLine keeps its initial value. Presumably it
                # was meant to skip leading comment lines; left as is since
                # enabling it would change which redirects get reported.
                if (firstDirectiveLine == lineNr) and commentLine:
                    firstDirectiveLine += 1

                # Anything but directives after a valid redirect:
                if validRedirectPresent and not directiveLine:
                    match = rec.match(r'\s*(\S.*?)\s*$', line)
                    if match:
                        start = match.start(1)
                        end = match.end(1)
                        msg = 'Non-empty non-comment line after valid redirect'
                        ao.out(path, lineNr, start, end, line, msg)
                    continue

                # Redirects:
                match = rec.match(r'#REDIRECT(\s*)(?P<name>.*)', line)
                if match:
                    # NOTE(review): firstDirectiveLine is never 0 (see
                    # above), so the else branch is currently unreachable.
                    if firstDirectiveLine:
                        name = match.group(r'name')
                        if not name:
                            msg = 'Redirect without target'
                            ao.out(path, lineNr, 0, len(line), line, msg)
                        else:
                            validRedirectPresent = True
                    else:
                        msg = 'Redirect in non-first line'
                        ao.out(path, lineNr, 0, len(line), line, msg)
                    continue

                # No further checks for directive lines:
                if directiveLine:
                    continue

                # Old-style indenting and definition lists.
                # NOTE(review): the first alternative ([:;]*) also matches
                # the empty string, so the smiley exclusion in the second
                # alternative is never consulted.
                match = rec.match(r'''^(?P<firstChar>[:;])((?P<extraChars>[:;]*)
                    |($|[^-\(\{\[\|\)\}\]pPD] # Do not match smilies.
                    ))''', line, re.VERBOSE)
                if match:
                    firstChar = match.group(r'firstChar')
                    extraCount = len(match.group(r'extraChars'))
                    end = 1 + extraCount
                    if firstChar == r':':
                        msg = 'Old wiki indenting'
                        ao.out(path, lineNr, 0, end, line, msg)
                        continue
                    if firstChar == r';':
                        msg = 'Old wiki definition list'
                        ao.out(path, lineNr, 0, end, line, msg)
                        continue

                # Old-style tags:
                matches = rec.finditer(r'''<(?P<close>[/]?)(?P<name>(
                    b|i|nowiki|pre|toc|tt
                    ))>''', line, re.IGNORECASE | re.VERBOSE)
                for match in matches:
                    start = match.start()
                    end = match.end()
                    closing = match.group(r'close')
                    tagName = match.group(r'name')
                    tagType = 'close' if closing else 'open'
                    msg = 'Old wiki tag {0} {1}'.format(tagName, tagType)
                    ao.out(path, lineNr, start, end, line, msg)

                # Line breaks:
                matches = rec.finditer(r'''
                    (?P<open><[<`]*)
                    (?P<name>br)
                    (?P<close>[>`]*>)
                    ''', line, re.IGNORECASE | re.VERBOSE)
                for match in matches:
                    start = match.start()
                    end = match.end()
                    tagOpen = match.group('open')
                    tagName = match.group('name')
                    tagClose = match.group('close')
                    if (tagOpen == '<') and (tagClose == '>'):
                        msg = 'Old wiki linebreak'
                        ao.out(path, lineNr, start, end, line, msg)
                        continue
                    if ((tagOpen == '<<') and (tagClose[0:2] == '>>')
                            and (tagName != 'BR')):
                        msg = 'Invalid linebreak'
                        ao.out(path, lineNr, start, end, line, msg)
                        continue

                # Headlines:
                matches = rec.finditer(r'''^
                    (?P<spaceBeforOpen>\s*) # Illegal.
                    (?P<openTag>[=]+) # Headline open tag.
                    (?P<spaceAfterOpen>\s*) # Required.
                    (?P<nIndicator>[\#*]*)\s* # Numbering from old wiki.
                    (?P<text>.*?) # Required headline text (non-greedy).
                    (?P<spaceBeforClose>\s*) # Required.
                    (?P<closeTag>[=]*) # Has to be same as open tag.
                    (?P<spaceAfterClose>\s*) # Illegal trailing whitespace.
                    $''', line, re.VERBOSE)
                for match in matches:
                    spaceBeforOpen = match.group('spaceBeforOpen')
                    openTag = match.group('openTag')
                    openTagStart = match.start('openTag')
                    openTagEnd = match.end('openTag')
                    spaceAfterOpen = match.group('spaceAfterOpen')
                    nIndicator = match.group('nIndicator')
                    text = match.group('text')
                    spaceBeforClose = match.group('spaceBeforClose')
                    closeTag = match.group('closeTag')
                    spaceAfterClose = match.group('spaceAfterClose')
                    if spaceBeforOpen:
                        end = len(spaceBeforOpen)
                        msg = 'Headline starts with whitespace'
                        ao.out(path, lineNr, 0, end, line, msg)
                    if len(openTag) > 5:
                        start = openTagStart
                        end = openTagEnd
                        msg = 'Headline of level > 5'
                        ao.out(path, lineNr, start, end, line, msg)
                    if text:
                        iMatches = rec.finditer(r"[`']{2,}", text)
                        for iMatch in iMatches:
                            start = match.start('text') + iMatch.start()
                            end = match.start('text') + iMatch.end()
                            msg = 'Headline text contains markup'
                            ao.out(path, lineNr, start, end, line, msg)
                    else:
                        end = len(line)
                        start = openTagEnd - 1
                        msg = 'Headline contains no text'
                        ao.out(path, lineNr, start, end, line, msg)
                        continue
                    if not spaceAfterOpen:
                        if nIndicator:
                            start = match.start('nIndicator')
                        else:
                            start = match.start('text')
                        msg = 'Headline without whitespace after open tag'
                        ao.out(path, lineNr, start, start + 1, line, msg)
                    if nIndicator:
                        start = match.start('nIndicator')
                        end = match.end('nIndicator')
                        msg = 'Headline with old numbering indicator'
                        ao.out(path, lineNr, start, end, line, msg)
                    if not closeTag:
                        msg = 'Headline without close tag'
                        ao.out(path, lineNr, len(line)-1, len(line), line, msg)
                        # The remaining checks need a close tag.
                        continue
                    if len(openTag) != len(closeTag):
                        start = match.start('closeTag')
                        end = match.end('closeTag')
                        msg = ('Headline with different length open and close'
                            + ' tags')
                        ao.out(path, lineNr, start, end, line, msg)
                    if not spaceBeforClose:
                        start = match.start('closeTag')
                        msg = 'Headline without whitespace before close tag'
                        ao.out(path, lineNr, start, start + 1, line, msg)
                    if spaceAfterClose:
                        start = match.start('spaceAfterClose')
                        end = match.end('spaceAfterClose')
                        msg = 'Headline ends with whitespace'
                        ao.out(path, lineNr, start, end, line, msg)

                # Links:
                matches = rec.finditer(r'''
                    (?P<openBrackets>\[[\[`]*) # Valid links got 2 brackets
                    (?P<openQuote>"?) # Artifact from old wiki conversion
                    \s*
                    (?P<linkUrl>.*?) # Link URL (not greedy)
                    \s*
                    (?P<closeQuote>"?) # Artifact from old wiki conversion
                    (?P<closeBrackets>[\]`]*\]) # Valid links got 2 brackets
                    ''', line, re.IGNORECASE | re.VERBOSE)
                for match in matches:
                    start = match.start()
                    end = match.end()
                    openBrackets = match.group('openBrackets')
                    openQuote = match.group('openQuote')
                    linkUrl = match.group('linkUrl')
                    if openQuote:
                        msg = 'Fail-converted unnamed internal link'
                        ao.out(path, lineNr, start, end, line, msg)
                        continue
                    if (len(openBrackets) == 1) and (':' in linkUrl):
                        msg = 'Fail-converted external link'
                        ao.out(path, lineNr, start, end, line, msg)
                        continue

                # Old-style upload links:
                reStr = r'(^|\s)(?P<link>upload:\S+)(\s|$)'
                matches = rec.finditer(reStr, line, re.I)
                for match in matches:
                    start = match.start('link')
                    end = match.end('link')
                    msg = 'Old wiki upload link'
                    ao.out(path, lineNr, start, end, line, msg)

    except KeyboardInterrupt:
        o.write('\nProcessing interrupted by user!\n')

    # Summary:
    eFileCount = d.decorateText(str(fileCount), d.textBYellow)
    eBlistedCount = d.decorateText(str(blistedCount), d.textBYellow)
    if ao.anomalyCount:
        eAnomalyCount = d.decorateText(str(ao.anomalyCount), d.textBYellow)
        eLineCount = d.decorateText(str(ao.lineCount), d.textBYellow)
        ePathCount = d.decorateText(str(ao.pathCount), d.textBYellow)
        msg = ('\nFound {0} anomalies in {1} lines from {2} files'
            + ' ({3} scanned, {4} excluded):\n')
        o.write(msg.format(eAnomalyCount, eLineCount, ePathCount, eFileCount
            , eBlistedCount))
        anomalyCounts = ao.anomalyCounts
        # Right-align all counts to the width of the largest one.
        maxValueLen = len(str(max(anomalyCounts.values())))
        for key in sorted(anomalyCounts):
            eCount = '{0:{1}}'.format(anomalyCounts[key], maxValueLen)
            eCount = d.decorateText(eCount, d.textBYellow)
            o.write(' {0} {1}\n'.format(eCount, key))
    else:
        msg = '\nFound no anomalies in {0} files ({1} excluded).\n'
        o.write(msg.format(eFileCount, eBlistedCount))
618
# Start the scan only when this file is run as a script.
if __name__ == "__main__":
    main()