Editieren Diskussion Vergangenheit Dateianhänge

attachment:larpWikiAnomalyScannerV1.py von LarpWikiAnomalyScanner

Dateianhang 'larpWikiAnomalyScannerV1.py'

Herunterladen

   1 #!/usr/bin/env python
   2 #
   3 # Scans all wiki page sources in current directory and outputs found anomalies
   4 # to stdout in a human readable format.
   5 #
   6 # Requires Python 3
   7 #
   8 # Detected anomalies:
   9 # - Obscure code points:
  10 #   - Replacement code point "�".
  11 #   - Marks in grapheme clusters without a leading letter.
  12 #   - Category C except tab.
  13 # - Invalid wiki directives:
  14 #   - Redirect in other than first line after optional leading comments.
  15 #   - Any non-comment non-directive after valid redirect.
  16 # - Old wiki tags:
  17 #   - <b>
  18 #   - <br>
  19 #   - <i>
  20 #   - <nowiki>
  21 #   - <pre>
  22 #   - <toc>
  23 #   - <tt>
  24 # - Tag case:
  25 #   - <<BR>>
  26 # - Headlines:
  27 #   - leading or trailing whitespace.
  28 #   - Open and close tags of differing length.
  29 #   - Level > 5.
  30 #   - Missing whitespace between tags and headline text.
  31 #   - Headlines with leading "#" or "*" in name (leftovers from old wiki).
  32 #   - Missing headline text (except "#" or "*").
  33 #   - Markup in text.
  34 # - Links:
  35 #   - Quoted internal links (failed old wiki conversion).
  36 #   - Old-wiki-style external links.
  37 #   - Old-wiki-style upload/attachment links.
  38 # - Lists:
  39 #   - Old-wiki-style bullet lists (line starts with '*')
  40 #   - Old-wiki-style numbered list (line starts with '#') when mixed with 
  41 #     bullet lists (else they look like a directive or comment).
  42 # - Old wiki paragraph modes:
  43 #   - Indenting (leading ':').
  44 #   - Definition list (leading ';').
  45 #
  46 # 2012-12-26 Allan Wegan <allanwegan@allanwegan.de>
  47 #
  48 
  49 import glob
  50 import os
  51 import platform
  52 import sys
  53 import unicodedata
  54 
# Page source file names excluded from scanning:
blacklist = (
  r'HilfeZurCreoleSyntax.txt',
  r'.')  # NOTE(review): r'.' can never match a '*.txt' glob result — confirm whether this entry is a leftover.
  58 
   59 # Finds all occurrences of a regular expression pattern in given text:
  60 import re
  61 class ReCache:
  62   
  63   cache = dict()
  64   
  65   def compile(self, pattern, flags = 0):
  66     cache = self.cache
  67     if pattern not in cache:
  68       cache[pattern] = dict()
  69     cache = cache[pattern]
  70     if flags not in cache:
  71       cache[flags] = re.compile(pattern, flags)
  72     return cache[flags]
  73   def finditer(self, pattern, text, flags = 0):
  74     return self.compile(pattern, flags).finditer(text)
  75   def match(self, pattern, text, flags = 0):
  76     return self.compile(pattern, flags).match(text)
  77   def search(self, pattern, text, flags = 0):
  78     return self.compile(pattern, flags).search(text)
  79   def sub(self, pattern, replacement, text, flags = 0):
  80     return self.compile(pattern, flags).sub(replacement, text)
  81 
  82 # Outputs found anomalies:
  83 import urllib.parse
  84 class AnomalyOutputter:
  85    
  86   o = None
  87   e = None
  88   d = None
  89   qoute = r'"'
  90   ellipsis = r''
  91   sol = r'|'
  92   eol = r'|'
  93   maxPartLength = 70
  94   minAfterLength = 20
  95   
  96   pathCount = 0
  97   lineCount = 0
  98   anomalyCount = 0
  99   anomalyCounts = dict()
 100   lastPath = r''
 101   lastLineNr = 0
 102   
 103   def __init__(self, outputStream, textEscaper, textDecorator):
 104     self.o = outputStream
 105     self.e = textEscaper
 106     self.d = textDecorator
 107   
 108   def out(self, path, lineNr, startColumn, endColumn, line, anomaly):
 109     o = self.o
 110     e = self.e
 111     d = self.d
 112     q = self.qoute
 113     if self.lastPath != path:
 114       self.lastPath = path
 115       self.pathCount += 1
 116       ePath = d.decorateText(e.escape(path), d.textBCyan)
 117       pageName = path.replace(r' - ', r'/')
 118       if pageName[-4:] == r'.txt':
 119         pageName = pageName[0:-4]
 120       url = r'https://larpwiki.de/' + urllib.parse.quote(pageName)
 121       eUrl = d.decorateText(url, d.textWhite)
 122       o.write('\n%s%s%s\n<%s>:\n' % (q, ePath, q, eUrl))
 123     if self.lastLineNr != lineNr:
 124       if self.lastLineNr != lineNr:
 125         self.lineCount += 1
 126         self.lastLineNr = lineNr
 127       eLineNr = d.decorateText(str(lineNr + 1), d.textBYellow)
 128       o.write('  Line %s:\n' % (eLineNr))
 129     self.anomalyCount += 1
 130     if anomaly not in self.anomalyCounts:
 131       self.anomalyCounts[anomaly] = 1
 132     else:
 133       self.anomalyCounts[anomaly] += 1
 134     eColumn = d.decorateText(str(startColumn + 1), d.textBYellow)
 135     
 136     ml = self.maxPartLength
 137     # Extract as much of the anomaly as allowed and selected:
 138     t = e.escapeLimitRight(line[startColumn:endColumn], ml)
 139     part = t[0]
 140     partCpLength = t[1]
 141     partComplete = ((endColumn - startColumn - partCpLength) == 0)
 142     ml = max(0, ml - len(part))
 143     # Extract leading text but reserve some quota for trailing:
 144     if partComplete:
 145       mal = min(len(line) - endColumn, ml, int(ml / 2), self.minAfterLength)
 146     else:
 147       mal = 0
 148     bLength = min(startColumn, ml - mal)
 149     t = e.escapeLimitLeft(line[:startColumn], bLength)
 150     before = t[0]
 151     beforeCpLength = t[1]
 152     ml = max(0, ml - len(before))
 153     # Extract as much of trailing text as available and quota left:
 154     if partComplete:
 155       t = e.escapeLimitRight(line[endColumn:], ml)
 156       after = t[0]
 157       afterCpLength = t[1]
 158     else:
 159       after = r''
 160       afterCpLength = 0
 161     
 162     if startColumn - beforeCpLength > 0:
 163       sol = self.ellipsis
 164     else:
 165       sol = self.sol
 166     if (startColumn + partCpLength + afterCpLength) < len(line):
 167       eol = self.ellipsis
 168     else:
 169       eol = self.eol
 170     before = d.decorateText(before, d.textYellow)
 171     part = d.decorateText(part, d.textBYellow, d.textUnderline)
 172     after = d.decorateText(after, d.textYellow)
 173     o.write('    Column %s, anomaly %s%s%s:\n' % (eColumn, q, anomaly, q))
 174     o.write('      %s%s%s%s%s%s%s\n' % (sol, q, before, part, after, q, eol))
 175 
 176 # Colorizes output for ANSI terminals:
 177 class AnsiTextDecorator:
 178   
 179   textBlack = r'30'
 180   textRed = r'31'
 181   textGreen = r'32'
 182   textYellow = r'33'
 183   textBlue = r'34'
 184   textMagenta = r'35'
 185   textCyan = r'36'
 186   textGrey = r'37'
 187   textBGrey = r'30;1'
 188   textBRed = r'31;1'
 189   textBGreen = r'32;1'
 190   textBYellow = r'33;1'
 191   textBBlue = r'34;1'
 192   textBMagenta = r'35;1'
 193   textBCyan = r'36;1'
 194   textWhite = r'37;1'
 195   textBold = r'1'
 196   textItalic = r'3'
 197   textUnderline = r'4'
 198   backgroundBlack = r'40'
 199   backgroundRed = r'41'
 200   backgroundGreen = r'42'
 201   backgroundYellow = r'43'
 202   backgroundBlue = r'44'
 203   backgroundMagenta = r'45'
 204   backgroundCyan = r'46'
 205   backgroundGrey = r'47'
 206   
 207   def decorateText(self, text, *codes):
 208     if not len(codes):
 209       return text
 210     codesString = r''
 211     for code in codes:
 212       codesString += '\x1B[' + code + r'm'
 213     return codesString + text + '\x1B[0m'
 214 
# Drop-in no-op decorator for outputs without ANSI support (pipes, Windows):
class dummyTextDecorator(AnsiTextDecorator):
  def decorateText(self, text, *codes):
    """Ignores all codes and returns text unchanged."""
    return text
 218 
 219 # Escapes non-printable code points except space (0x20) in given text:
 220 from io import StringIO
 221 class TextEscaper:
 222   
 223   def escape(self, text):
 224     if not len(text): return r''
 225     return repr(text)[1:-1].replace(r'"', r'\"')
 226 
 227   def escapeLimitRight(self, text, maxLength):
 228     if maxLength <= 0: return (r'', 0)
 229     buffer = StringIO()
 230     length = 0
 231     cpCount = 0
 232     for cp in text:
 233       cp = self.escape(cp)
 234       newLength = length + len(cp)
 235       if newLength > maxLength:
 236         break
 237       buffer.write(cp)
 238       cpCount += 1
 239       length = newLength
 240       if length == maxLength:
 241         break
 242     return (buffer.getvalue(), cpCount)
 243   
 244   def escapeLimitLeft(self, text, maxLength):
 245     if maxLength <= 0: return (r'', 0)
 246     cpList = []
 247     length = 0
 248     index = len(text)
 249     while index > 0:
 250       index -= 1
 251       cp = self.escape(text[index])
 252       newLength = length + len(cp)
 253       if newLength > maxLength:
 254         break
 255       cpList.append(cp)
 256       length = newLength
 257       if length == maxLength:
 258         break
 259     cpList.reverse()
 260     buffer = StringIO()
 261     for cp in cpList:
 262       buffer.write(cp)
 263     return (buffer.getvalue(), len(cpList))
 264 
 265 o = sys.stdout
 266 e = TextEscaper()
 267 if o.isatty() and (platform.system() != r'Windows'):
 268   d = AnsiTextDecorator()
 269   import subprocess
 270   cols = int(subprocess.Popen(('tput', 'cols'), 
 271     stdout=subprocess.PIPE).stdout.read())
 272   if cols <= 0:
 273     cols = 80
 274 else:
 275   d = dummyTextDecorator()
 276   cols = 80
 277 ao = AnomalyOutputter(o, e, d)
 278 ao.maxPartLength = cols - 11
 279 rec = ReCache()
 280 
 281 # Test all *.txt files:
 282 o.write('Scanning files...\n')
 283 fileCount = 0
 284 blistedCount = 0
 285 paths = glob.iglob(r"*.txt")
 286 for path in paths:
 287   if not os.path.isfile(path):
 288     continue
 289   if path in blacklist:
 290     blistedCount += 1
 291     continue
 292   fileCount += 1
 293   file = open(path, 'r')
 294   lineNr = -1
 295   firstDirectiveLine = 1
 296   validRedirectPresent = False
 297   for line in file:
 298     line = rec.sub("\n$", r'', line)
 299     lineNr += 1
 300     commentLine = (rec.match(r'##+\s', line) != None)
 301     directiveLine = not commentLine and (rec.match(r'#[^#]', line) != None)
 302     
 303     # Obscure code points:
 304     markAllowed = False
 305     lineLength = len(line)
 306     for lineIndex, cp in enumerate(line):
 307       anomaly = False
 308       unexpectedMark = False
 309       cpCat = unicodedata.category(cp)
 310       cpCatMain = cpCat[0]
 311       
 312       if cpCatMain == r'L':
 313         markAllowed = True
 314         continue
 315       
 316       if cpCatMain != r'M':
 317         markAllowed = False
 318       
 319       if cp == r'': # REPLACEMENT CHARACTER, category So
 320         anomaly = True
 321       
 322       if not anomaly:
 323         if cpCatMain in (r'N', r'P', r'S', r'Z') or cp in (
 324           '\t', 
 325           '\xad', # SOFT HYPHEN, category Cf
 326           '\u200d', # ZERO WIDTH JOINER, category Cf
 327           '\u200e', # LEFT-TO-RIGHT MARK, category Cf
 328           None):
 329           continue
 330         if cpCatMain in (r'M'): # Special handling for marks.
 331           if markAllowed:
 332             continue
 333           # Not in letter cluster.
 334           anomaly = True
 335           unexpectedMark = True
 336       
 337       # @Todo: There are legitimate code points for RTL-languages in Cf.
 338       
 339       # Handle anomaly:
 340       before = line[max(0, lineIndex - 30):lineIndex]
 341       after = line[lineIndex + 1:lineIndex + 31]
 342       cpName = unicodedata.name(cp, r'unnamed')
 343       if unexpectedMark:
 344         suffix = r' not preceded by a letter'
 345       else:
 346         suffix = r''
 347       ao.out(path, lineNr, lineIndex, lineIndex + 1, line,
 348         (r'Unicode %s (%s, category %s)' + suffix)
 349         % (e.escape(cp), cpName, cpCat))
 350     
 351     # Old-wiki-style lists:
 352     match = rec.match(r'(\*|#(\*|#(\*|#)))[*#]*', line)
 353     if match:
 354       directiveLine = False
 355       commentLine = False
 356       start = match.start()
 357       end = match.end()
 358       ao.out(path, lineNr, start, end, line, r'Old wiki list')
 359 
 360     # No further wiki syntax checks for comments or after valid redirects:
 361     if commentLine:
 362       continue
 363     
 364     # Determine first directive line
 365     if (firstDirectiveLine == lineNr) and commentLine:
 366       firstDirectiveLine += 1
 367 
 368     # Detect extra non-comment markup after valid redirect:
 369     if validRedirectPresent and not directiveLine:
 370       match = rec.match(r'\s*(\S.*?)\s*$', line)
 371       if match:
 372         start = match.start(1)
 373         end = match.end(1)
 374         ao.out(path, lineNr, start, end, line,
 375           r'Non-empty non-comment line after valid redirect')
 376         continue
 377 
 378     # Detect redirects:
 379     match = rec.match(r'#REDIRECT(\s*)(?P<name>.*)', line)
 380     if match:
 381       if firstDirectiveLine:
 382         name = match.group(r'name')
 383         if not name:
 384           ao.out(path, lineNr, 0, len(line), line, r'Redirect without target')
 385         else:
 386           validRedirectPresent = True
 387       else:
 388         ao.out(path, lineNr, 0, len(line), line, r'Redirect in non-first line')
 389       continue
 390 
 391     # Skip other directives:
 392     if directiveLine:
 393       continue
 394 
 395     # Old-wiki-style features dependent on first char of line:
 396     match = rec.match(r'''^(?P<firstChar>[:;])((?P<extraChars>[:;]*)|($|
 397       [^-\(\{\[\|\)\}\]pPD] # Do not match smilies.
 398       ))''', line, re.VERBOSE)
 399     if match:
 400       linePartPos = match.start()
 401       linePart = match.group()
 402       firstChar = match.group(r'firstChar')
 403       extraCount = len(match.group(r'extraChars'))
 404       end = 1 + extraCount
 405       if firstChar == r':':
 406         ao.out(path, lineNr, 0, end, line, r'Old wiki indenting')
 407         continue
 408       if firstChar == r';':
 409         ao.out(path, lineNr, 0, end, line, r'Old wiki definition list')
 410         continue
 411 
 412     # Old wiki tags:
 413     matches = rec.finditer(r'''<(?P<close>[/]?)(?P<name>(
 414       b|i|nowiki|pre|toc|tt
 415       ))>''', line, re.IGNORECASE | re.VERBOSE)
 416     for match in matches:
 417       start = match.start()
 418       end = match.end()
 419       closing = match.group(r'close')
 420       tagName = match.group(r'name')
 421       if closing:
 422         tagType = r'close'
 423       else:
 424         tagType = r'open'
 425       ao.out(path, lineNr, start, end, line, r'Old wiki tag %s %s'
 426         % (tagName, tagType))
 427 
 428     # <<BR>> tags (old and new):
 429     matches = rec.finditer(r'''
 430       (?P<open><[<`]*)
 431       (?P<name>br)
 432       (?P<close>[>`]*>)
 433       ''', line, re.IGNORECASE | re.VERBOSE)
 434     for match in matches:
 435       start = match.start()
 436       end = match.end()
 437       linePart = match.group()
 438       tagOpen = match.group(r'open')
 439       tagName = match.group(r'name')
 440       tagClose = match.group(r'close')
 441       if (tagOpen == '<') and (tagClose == '>'):
 442         ao.out(path, lineNr, start, end, line, r'Old wiki linebreak')
 443         continue
 444       if (tagOpen == '<<') and (tagClose[0:2] == '>>') and (tagName != 'BR'):
 445         ao.out(path, lineNr, start, end, line, r'Invalid linebreak')
 446         continue
 447     
 448     # Headlines:
 449     matches = rec.finditer(r'''^
 450       (?P<spaceBeforOpen>\s*) # Illegal.
 451       (?P<openTag>[=]+) # Headline open tag.
 452       (?P<spaceAfterOpen>\s*) # Required.
 453       (?P<nIndicator>[\#*]*)\s* # Numbering indicator from old wiki.
 454       (?P<text>.*?) # Required headline text (non-greedy).
 455       (?P<spaceBeforClose>\s*) # Required.
 456       (?P<closeTag>[=]*) # Has to be same as open tag.
 457       (?P<spaceAfterClose>\s*) # Illegal trailing whitespace.
 458       $''', line, re.VERBOSE)
 459     for match in matches:
 460       linePartPos = match.start()
 461       linePart = match.group()
 462       spaceBeforOpen = match.group(r'spaceBeforOpen')
 463       openTag = match.group(r'openTag')
 464       openTagStart = match.start(r'openTag')
 465       openTagEnd = match.end(r'openTag')
 466       spaceAfterOpen = match.group(r'spaceAfterOpen')
 467       nIndicator = match.group(r'nIndicator')
 468       text = match.group(r'text')
 469       spaceBeforClose = match.group(r'spaceBeforClose')
 470       closeTag = match.group(r'closeTag')
 471       spaceAfterClose = match.group(r'spaceAfterClose')
 472       if spaceBeforOpen:
 473         end = len(spaceBeforOpen)
 474         ao.out(path, lineNr, 0, end, line, r'Headline starts with whitespace')
 475       if len(openTag) > 5:
 476         start = openTagStart
 477         end = openTagEnd
 478         ao.out(path, lineNr, start, end, line, r'Headline of level > 5')
 479       if not text:
 480         end = len(line)
 481         start = openTagEnd - 1
 482         ao.out(path, lineNr, start, end, line, r'Headline contains no text')
 483         continue
 484       else:
 485         iMatches = rec.finditer(r"[`']{2,}", text)
 486         for iMatch in iMatches:
 487           start = match.start(r'text') + iMatch.start()
 488           end = match.start(r'text') + iMatch.end()
 489           ao.out(path, lineNr, start, end, line,
 490             r'Headline text contains markup')
 491       if not spaceAfterOpen:
 492         if nIndicator:
 493           start = match.start(r'nIndicator')
 494         else:
 495           start = match.start(r'text')
 496         ao.out(path, lineNr, start, start + 1, line,
 497           r'Headline without whitespace after open tag')
 498       if nIndicator:
 499         start = match.start(r'nIndicator')
 500         end = match.end(r'nIndicator')
 501         ao.out(path, lineNr, start, end, line,
 502           r'Headline with old numbering indicator')
 503       if not closeTag:
 504         ao.out(path, lineNr, len(line) - 1, len(line), line,
 505           r'Headline without close tag')
 506         continue # Skip following checks when no close tag present.
 507       if len(openTag) != len(closeTag):
 508         start = match.start(r'closeTag')
 509         end = match.end(r'closeTag')
 510         ao.out(path, lineNr, start, end, line,
 511           r'Headline with different length open and close tags')
 512       if not spaceBeforClose:
 513         start = match.start(r'closeTag')
 514         ao.out(path, lineNr, start, start + 1, line,
 515           r'Headline without whitespace before close tag')
 516       if spaceAfterClose:
 517         start = match.start(r'spaceAfterClose')
 518         end = match.end(r'spaceAfterClose')
 519         ao.out(path, lineNr, start, end, line, r'Headline ends with whitespace')
 520         
 521     # Links:
 522     matches = rec.finditer(r'''
 523       (?P<openBrackets>\[[\[`]*) # Link open brackets (2 for valid links).
 524       (?P<openQuote>"?) # Artifact from old wiki conversion.
 525       \s*
 526       (?P<linkUrl>.*?) # Link URL (not greedy).
 527       \s*
 528       (?P<closeQuote>"?) # Artifact from old wiki conversion.
 529       (?P<closeBrackets>[\]`]*\]) # Link open brackets (2 for valid links).
 530       ''', line, re.IGNORECASE | re.VERBOSE)
 531     for match in matches:
 532       start = match.start()
 533       end = match.end()
 534       linePart = match.group()
 535       openBrackets = match.group(r'openBrackets')
 536       openQuote = match.group(r'openQuote')
 537       linkUrl = match.group(r'linkUrl')
 538       closeQuote = match.group(r'closeQuote')
 539       closeBrackets = match.group(r'closeBrackets')
 540       if openQuote:
 541         ao.out(path, lineNr, start, end, line,
 542           r'Fail-converted unnamed internal link')
 543         continue
 544       if (len(openBrackets) == 1) and rec.search(r':', linkUrl):
 545         ao.out(path, lineNr, start, end, line,
 546           r'Fail-converted external link')
 547         continue
 548     
 549     # Old wiki uploads:
 550     matches = rec.finditer(r'(^|\s)(?P<link>upload:\S+)(\s|$)', line, re.I)
 551     for match in matches:
 552       start = match.start(r'link')
 553       end = match.end(r'link')
 554       ao.out(path, lineNr, start, end, line,
 555         r'Old wiki upload link')
 556 
 557   file.close()
 558 eFileCount = d.decorateText(str(fileCount), d.textBYellow)
 559 eBlistedCount = d.decorateText(str(blistedCount), d.textBYellow)
 560 if ao.anomalyCount:
 561   eAnomalyCount = d.decorateText(str(ao.anomalyCount), d.textBYellow)
 562   eLineCount = d.decorateText(str(ao.lineCount), d.textBYellow)
 563   ePathCount = d.decorateText(str(ao.pathCount), d.textBYellow)
 564   o.write(('\nFound %s anomalies in %s lines from %s files'
 565     + ' (%s scanned, %s excluded):\n')
 566     % (eAnomalyCount, eLineCount, ePathCount, eFileCount, eBlistedCount))
 567   anomalyCounts = ao.anomalyCounts
 568   maxValue = sorted(anomalyCounts.values())[-1]
 569   format = r'%' + repr(len(repr(maxValue))) + r'i';
 570   keys = sorted(anomalyCounts.keys())
 571   for key in keys:
 572     eCount = d.decorateText(format % (anomalyCounts[key]), d.textBYellow)
 573     o.write('  %s  %s\n' % (eCount, key))
 574 else:
 575   o.write('\nFound no anomalies in %i files (%s excluded).\n'
 576     % (fileCount, eBlistedCount))

Neuer Dateianhang

Neuer Dateianhang
Umbenennen auf
Anhänge gleichen Namens überschreiben
Tippe: SMAUG

Gespeicherte Dateianhänge

Um Dateianhänge in eine Seite einzufügen sollte unbedingt eine Angabe wie attachment:dateiname benutzt werden, wie sie auch in der folgenden Liste der Dateien erscheint. Es sollte niemals die URL des Verweises ("laden") kopiert werden, da sich diese jederzeit ändern kann und damit der Verweis auf die Datei brechen würde.
  • [laden | anzeigen] (2017-12-30 12:11:58, 17.2 KB) [[attachment:example.png]]
  • [laden | anzeigen] (2012-12-29 10:47:45, 18.0 KB) [[attachment:larpWikiAnomalyScannerV1.py]]
  • [laden | anzeigen] (2017-12-30 11:04:24, 23.7 KB) [[attachment:larpWikiAnomalyScannerV2.py]]
  • [laden | anzeigen] (2018-01-01 19:37:58, 23.0 KB) [[attachment:larpWikiAnomalyScannerV3.py]]
 Alle Dateien | Ausgewählte Dateien: löschen verschieben auf Seite kopieren auf Seite