1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 import sys
26 import time
27 import re
28 import os.path
29 import ssl
30 from glob import iglob
31 import urllib.parse
32 import urllib.error
33 import urllib.request
34 from html.parser import HTMLParser
35
# Directory containing this script; used to locate the CA chain file below.
baseDir = os.path.dirname(__file__)

# Root URL of the wiki; page paths are appended directly to it.
baseUrl = "https://www.larpwiki.de/"
# Certificate chain file expected next to this script. main() adds it to the
# SSL context unless this is set to None.
caChainFile = os.path.join(baseDir, "www.larpwiki.de.pem")
# Wiki page whose HTML is parsed to discover the links to all pages.
indexPage = "Admin/SiteIndex"

# Retry policy that getUrlContent() hands to retryOnError():
# maximum number of retries
netRetryM = 0x7FFFFFFF
# delay in seconds before the first retry
netRetryD = 1.0
# computes the next delay from the previous one (doubles it)
netRetryDFun = lambda oldDelay: oldDelay * 2.0
45
class UrlNotFound(Exception):
    """Raised by retryOnError() when a request fails with HTTP status 404."""
47
48
def retryOnError(fun, retriesMax, retryDelay, retryDelayFun):
    """
    Call fun() until it succeeds and return its result.

    fun           -- callable taking no arguments.
    retriesMax    -- number of retries allowed after the first failure.
    retryDelay    -- seconds to wait before the first retry.
    retryDelayFun -- maps the previous delay to the next one.

    An HTTP 404 error is never retried; it is raised as UrlNotFound.
    Once the retries are used up the last exception is re-raised.
    """
    retriesLeft = retriesMax
    delay = retryDelay
    while True:
        try:
            return fun()
        # KeyboardInterrupt is not an Exception subclass, so it passes
        # through this handler untouched.
        except Exception as error:
            isHttpError = isinstance(error, urllib.error.HTTPError)
            if isHttpError and error.code == 404:
                raise UrlNotFound(error)
            if retriesLeft < 1:
                raise
            print("Error:", sys.exc_info())
            print("Waiting %f seconds before retrying (Retries left: %i)..." % (
                delay, retriesLeft
            ))
            time.sleep(delay)
            retriesLeft -= 1
            delay = retryDelayFun(delay)
68
69
def writeFile(path, content):
    """
    Write content to the file at path, replacing any existing file.

    A str is written as UTF-8 text; anything else is written unchanged in
    binary mode.
    """
    if isinstance(content, str):
        # Pin the encoding: the platform default may be unable to represent
        # non-ASCII page text, and the pages are decoded from UTF-8 anyway.
        with open(path, 'w', encoding='utf-8') as fd:
            fd.write(content)
    else:
        with open(path, 'wb') as fd:
            fd.write(content)
74
def createSiteUrl(pageUrl, getRaw):
    """
    Build the absolute URL of a wiki page.

    pageUrl -- page path relative to baseUrl.
    getRaw  -- if true, append the query string "?action=raw".
    """
    if getRaw:
        query = "?action=raw"
    else:
        query = ""
    return "{baseUrl}{pageUrl}{rawArgs}".format(
        baseUrl=baseUrl, pageUrl=pageUrl, rawArgs=query
    )
79
def createAttachmentUrl(pageUrl, name):
    """
    Build the absolute download URL of an attachment.

    pageUrl -- page path relative to baseUrl; inserted as given.
    name    -- attachment name; percent-encoded with no characters
               treated as safe, so "/" is encoded too.
    """
    fields = {
        "baseUrl": baseUrl,
        "pageUrl": pageUrl,
        "name": urllib.parse.quote(name, safe=''),
    }
    template = "{baseUrl}{pageUrl}?action=AttachFile&do=get&target={name}"
    return template.format(**fields)
84
def getSslContext():
    """
    Return a default SSL context that has at least one CA certificate.

    Python may or may not find the system certificates, depending on its
    version, the distribution and user-side configuration. If the default
    context reports no CA certificates, a list of known locations is
    tried in order until some are loaded.

    Raises Exception if no CA certificate could be loaded at all.
    """
    candidates = (
        # Typical bundle file locations.
        "/etc/ssl/certs/ca-certificates.crt",
        "/etc/pki/tls/certs/ca-bundle.crt",
        "/etc/ssl/ca-bundle.pem",
        "/etc/pki/tls/cacert.pem",
        "/etc/pki/ca-trust/extracted/pem/tls-ca-bundle.pem",
        # Typical certificate directory locations.
        "/etc/ssl/certs",
        "/system/etc/security/cacerts",
        "/usr/local/share/certs",
        "/etc/pki/tls/certs",
        "/etc/openssl/certs",
    )
    context = ssl.create_default_context()

    def hasCaCerts():
        return context.cert_store_stats()['x509_ca'] != 0

    for candidate in candidates:
        # Stop as soon as any earlier location provided certificates.
        if hasCaCerts():
            break
        if os.path.isdir(candidate):
            entries = iglob(os.path.join(candidate, "*"))
        elif os.path.isfile(candidate):
            entries = (candidate,)
        else:
            continue
        for entry in entries:
            # Best effort: entries that cannot be loaded are skipped.
            try:
                context.load_verify_locations(cafile=entry)
            except Exception:
                pass

    if not hasCaCerts():
        raise Exception("System CA certificates not found!")
    return context
126
def getUrlContent(sslContext, url):
    """
    Download url and return the response body as bytes.

    sslContext -- ssl.SSLContext used for the connection.
    url        -- absolute URL to fetch.

    Failures are retried through retryOnError() using the module-level
    netRetry* settings. Raises UrlNotFound on HTTP 404.
    """
    def fetch():
        # Read inside the retried callable so that an error in the middle
        # of the transfer is retried as well, and use the response as a
        # context manager so the connection is always closed.
        with urllib.request.urlopen(url, context=sslContext) as response:
            return response.read()
    return retryOnError(fetch, netRetryM, netRetryD, netRetryDFun)
132
def getPageText(sslContext, url):
    """
    Download url and return its body decoded as UTF-8 text.

    Bytes that are not valid UTF-8 are substituted by the "replace" error
    handler instead of raising.
    """
    rawBody = getUrlContent(sslContext, url)
    return rawBody.decode("utf-8", "replace")
135
136
class IndexPageParser(HTMLParser):
    """
    Collect the page links from the HTML of the wiki's index page.

    The parser walks through a fixed sequence of expected tags: the opening
    <h2 id="Wiki-Seiten">, its closing tag, then the next opening <ul>.
    From then on every <a> contributes its href to self.pages until a
    closing </ul> is seen; everything after that is ignored.
    """

    def __init__(self):
        super().__init__()
        # Collected link targets, without a leading slash.
        self.pages = []
        # Expected tags, in order. Each entry is a tuple of rules
        # (isOpen, tagName, requiredAttrs, callback); only the rules of the
        # first entry are active at any time.
        self._tags = [
            ((True, 'h2', (('id', 'Wiki-Seiten'),), self._removeFirstFromStack),),
            ((False, 'h2', (), self._removeFirstFromStack),),
            ((True, 'ul', (), self._removeFirstFromStack),),
            (
                (False, 'ul', (), self._removeFirstFromStack),
                (True, 'a', (), self._parsePageLinkOpen),
            ),
        ]

    def _removeFirstFromStack(self, isOpen, tag, attrs):
        """Advance to the next entry of expected tags."""
        del self._tags[0]

    def _parsePageLinkOpen(self, isOpen, tag, attrs):
        """Record the href of a link, stripped of one leading slash."""
        url = attrs.get('href')
        if url is None:
            # Anchors without a (valued) href carry no page link.
            return
        if url[0:1] == '/':
            url = url[1:]
        self.pages.append(url)

    def handleTag(self, isOpen, tag, attrs):
        """
        Run the callback of the first active rule matching this tag.

        isOpen -- True for a start tag, False for an end tag.
        tag    -- tag name.
        attrs  -- attributes as a dict or as an iterable of (name, value).
        """
        attrs = dict(attrs)
        if not self._tags:
            return
        for cOpen, cTag, cAttrs, onMatch in self._tags[0]:
            if (cOpen is isOpen and tag == cTag
                    and all(attrs.get(k, None) == v for k, v in cAttrs)):
                onMatch(isOpen, tag, attrs)
                return

    def handle_starttag(self, tag, attrs):
        # handleTag() converts the attribute pairs to a dict itself.
        self.handleTag(True, tag, attrs)

    def handle_endtag(self, tag):
        self.handleTag(False, tag, {})
180
def extractAattachmentNames(text):
    """
    Return the set of attachment names referenced in wiki markup.

    A name is the text following "[[attachment:" or "{{attachment:"
    (case-insensitive), up to the closing "]]" / "}}" or a "|".
    """
    # Brackets inside the character classes are escaped: an unescaped "[["
    # makes the re module emit "FutureWarning: Possible nested set".
    attachmentRe = re.compile(r'''(?<=[\[{]{2}attachment:)
        [^\]|}]+
        (?=[\]}]{2}|[|])''', re.VERBOSE | re.IGNORECASE)
    return set(attachmentRe.findall(text))
186
def main():
    """
    Download every page listed on the index page, plus its attachments.

    Files are written to the current working directory. Pages and
    attachments whose target file already exists are skipped. A page file
    is written only after its attachments were processed.
    """
    sslContext = getSslContext()
    if caChainFile is not None:
        sslContext.load_verify_locations(cafile=caChainFile)

    indexUrl = createSiteUrl(indexPage, False)
    print("Getting page index from {0}...".format(indexUrl))
    indexParser = IndexPageParser()
    indexParser.feed(getPageText(sslContext, indexUrl))
    pageUrls = sorted(indexParser.pages)
    pageTotal = len(pageUrls)

    # Slashes in page names are replaced so sub-pages become flat file names.
    slashRe = re.compile("/")
    for pageNo, pageUrl in enumerate(pageUrls, start=1):
        pageName = urllib.parse.unquote(pageUrl)
        pageFile = slashRe.sub(' - ', pageName) + '.txt'
        if os.path.isfile(pageFile):
            print('{0}/{1} page "{2}" already exists.'.format(
                pageNo, pageTotal, pageFile))
            continue

        rawUrl = createSiteUrl(pageUrl, True)
        print('{0}/{1} fetching page "{2}".'.format(pageNo, pageTotal, pageFile))
        print(' <{0}>'.format(rawUrl))
        try:
            pageText = getPageText(sslContext, rawUrl)
            attNames = sorted(extractAattachmentNames(pageText))
            attTotal = len(attNames)
            for attNo, attName in enumerate(attNames, start=1):
                attFile = slashRe.sub(
                    ' - ', "{0}.txt - {1}".format(pageName, attName))
                if os.path.isfile(attFile):
                    print('{0}/{1} attachment "{2}" already exists.'.format(
                        attNo, attTotal, attName))
                    continue
                if "/" in attName:
                    # Names containing a slash are only reported, not fetched.
                    print('{0}/{1} attachment "{2}" is a reference.'.format(
                        attNo, attTotal, attName))
                    continue
                attUrl = createAttachmentUrl(pageUrl, attName)
                print('{0}/{1} fetching attachment "{2}".'.format(
                    attNo, attTotal, attName))
                print(' <{0}>'.format(attUrl))
                try:
                    attBytes = getUrlContent(sslContext, attUrl)
                    writeFile(attFile, attBytes)
                except UrlNotFound:
                    print(' 404 - Not found!')
            # Written last: an existing page file is what makes a later
            # run skip this page.
            writeFile(pageFile, pageText)
        except UrlNotFound:
            print(' 404 - Not found!')
        # NOTE(review): pause assumed to apply only after a page was
        # actually requested, not for skipped pages - confirm.
        time.sleep(5.0)
239
# Run the download only when executed as a script, not when imported.
if __name__ == "__main__":
    main()