Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/python3
- """
- This program is a simple demonstration of how to
- process Wikipedia's EventStreams feed, which is publicly accessible.
- Python >= 3.6 is needed.
- More information about EventStream:
- https://wikitech.wikimedia.org/wiki/Event_Platform/EventStreams
- Information about the json schema of recentchange version 2:
- https://github.com/wikimedia/mediawiki-event-schemas/blob/master/jsonschema/mediawiki/recentchange/2.yaml
- External dependencies are used, to reduce my own workload.
- aiohttp: https://github.com/aio-libs/aiohttp
- aiohttp is a dependency of aiosseclient
- aiosseclient: https://github.com/ebraminio/aiosseclient
- To install the dependencies:
- # in a virtual environment
- pip install pytz aiohttp
- pip install git+https://github.com/ebraminio/aiosseclient
- # or as a user, if you're not in a virtual environment
- pip3 install pytz aiohttp --user
- pip3 install git+https://github.com/ebraminio/aiosseclient --user
- aiosseclient is a very simple implementation of an SSE client.
- Each event sent by Wikimedia carries a JSON string,
- which is parsed with the JSON parser from Python's standard library.
- The URLs are percent-encoded; the unquote function makes them readable.
- Fix for ssl-error: https://github.com/aio-libs/aiohttp/issues/3535#issuecomment-483268542
- """
- import asyncio
- import json
- import csv
- from datetime import datetime as dt
- from urllib.parse import unquote, quote
- from pathlib import Path
- import pytz
- from aiosseclient import aiosseclient
# URL of the stream that delivers information about recent changes.
STREAM_URL = 'https://stream.wikimedia.org/v2/stream/recentchange'

# Timezone that event timestamps are converted to before they are shown.
TIMEZONE = pytz.timezone('Europe/Berlin')
# Event types that are supported (kept for reference and column sizing).
TYPES = ["edit", "new", "log", "categorize", "external"]
# Length of the longest type name; used as the width of the type column.
TYPES_LEN = max(map(len, TYPES))

# https://pbs.twimg.com/media/D1cnO9KXgAAV6mf.jpg:large
# User names whose changes are reported, grouped by role.
USERS = {
    # functionaries
    'Hen3ry', 'Horst G.', 'Sargoth', 'XenonX3', 'Alraunenstern',
    # admins ('XenonX3' belongs to this group as well)
    'Aspirinikis', 'Cymothoa', 'DaB', 'Doc Taxon', 'Drahreg01', 'Felistoria',
    'Filzstift', 'Gereon K', 'JD', 'Kuebi', 'Krizolina', 'Kurator71',
    'Lustiger Seth', 'Partynia', 'Seewolf', 'Sqasher', 'Stefan64',
    # reviewers ("Sichter")
    'Abutoum', 'Alnilam', 'Amilamia', 'Andropov', 'Anidaat', 'AnnaS.',
    'Atomiccocktail', 'Benatrevqre', 'Berichtbestatter', 'David Navan',
    'Density', 'Elektrofisch', 'EH42', 'Feliks', 'Fiona B.', 'Fossa',
    'Framhein', 'Gabel1960', 'Gonzo Lubitsch', 'Gustav v.A.', 'Hardenacke',
    'Hot Gadling', 'Hvd69', 'In dubio pro dubio', 'JensB.', '-jkb-', 'Jmb1982',
    'Jonaster', 'JosFritz', 'KaiMarting', 'KarlV', 'Kopilot', 'KurtR',
    'Liberaler Humanist', 'Marcus C.', 'MBurch', 'Miltrak', 'Orik',
    'Perfect Tommy', 'Phi', 'Port(u*o)s', 'SanFran Farmer', 'Sänger',
    'Schwarze Feder', 'Simplicicus', 'Thoma', 'Toter alter Mann', 'Über-Blick',
    'Viciarg', 'Webverbesserer',
}
def ignore_aiohttp_ssl_eror(loop, aiohttpversion='3.5.4'):
    """Install a loop exception handler that ignores aiohttp issue #3535.

    There appears to be an issue on Python 3.7 and aiohttp SSL that throws a
    ssl.SSLError fatal error (ssl.SSLError: [SSL: KRB5_S_INIT] application data
    after close notify (_ssl.c:2609)) after we are already done with the
    connection. See GitHub issue aio-libs/aiohttp#3535.

    Given a loop, this sets up an exception handler that ignores this specific
    exception, but passes everything else on to the exception handler that was
    active before (or to the loop's default handler if none was set).

    Args:
        loop: The asyncio event loop on which the handler is installed.
        aiohttpversion: The handler is only installed if the installed
            aiohttp version is exactly equal to this string, assuming that
            later versions have the bug fixed. Pass ``None`` to install the
            handler regardless of the aiohttp version.

    Returns:
        None. If aiohttp cannot be imported or the version does not match,
        the loop is left untouched.
    """
    # Imported locally: the module's import block provides neither name.
    import ssl

    try:
        import aiohttp
        from aiohttp.client_proto import ResponseHandler
    except ImportError:
        # Without aiohttp there is no aiohttp SSL error to work around.
        return

    if aiohttpversion is not None and aiohttp.__version__ != aiohttpversion:
        return

    # A handler set via set_exception_handler() is called as (loop, context),
    # whereas loop.default_exception_handler only takes the context, so the
    # two cases have to be dispatched differently below.
    previous_handler = loop.get_exception_handler()

    def ignore_ssl_error(loop, context):
        if context.get('message') == 'SSL error in data received':
            # validate we have the right exception, transport and protocol
            exception = context.get('exception')
            protocol = context.get('protocol')
            if (
                isinstance(exception, ssl.SSLError)
                and exception.reason == 'KRB5_S_INIT'
                and isinstance(protocol, asyncio.sslproto.SSLProtocol)
                and isinstance(
                    getattr(protocol, '_app_protocol', None), ResponseHandler
                )
            ):
                if loop.get_debug():
                    asyncio.log.logger.debug('Ignoring aiohttp SSL KRB5_S_INIT error')
                return
        # Anything else is forwarded unchanged.
        if previous_handler is not None:
            previous_handler(loop, context)
        else:
            loop.default_exception_handler(context)

    loop.set_exception_handler(ignore_ssl_error)
async def event_generator(stream_url, retry_delay=1.0):
    """Yield decoded events from ``stream_url`` forever, reconnecting as needed.

    Each event delivered by ``aiosseclient`` has its ``data`` attribute parsed
    as JSON and the resulting object is yielded. When the connection ends or
    fails, the failure is logged and a new connection is opened after
    ``retry_delay`` seconds, so a persistent error does not turn into a tight
    reconnect loop.

    Args:
        stream_url: URL of the server-sent-events stream to read.
        retry_delay: Seconds to wait before reconnecting after the stream
            ended or raised.

    Yields:
        The JSON-decoded payload of every event whose data could be parsed.
        Events with unparsable data are logged and skipped.

    Raises:
        asyncio.CancelledError: Cancellation is always propagated instead of
            being treated as a connection failure.
    """
    import logging

    log = logging.getLogger(__name__)

    while True:
        try:
            async for event in aiosseclient(stream_url):
                try:
                    payload = json.loads(event.data)
                except (TypeError, ValueError):
                    # One malformed event should not tear down the connection.
                    log.warning('Skipping event with undecodable data: %r', event.data)
                    continue
                yield payload
        except asyncio.CancelledError:
            # On Python 3.7 CancelledError derives from Exception; never
            # swallow it, otherwise the generator cannot be cancelled.
            raise
        except Exception:
            # Deliberately broad: the stream must survive any client error,
            # but the failure is reported instead of silently dropped.
            log.warning('Event stream %s failed; reconnecting', stream_url, exc_info=True)
        await asyncio.sleep(retry_delay)
def selector(event, **kwargs):
    """Tell whether ``event`` satisfies every criterion given as a keyword.

    Each keyword names a field of the event; the field is looked up with
    ``event.get`` (so a missing field is seen as ``None``). A callable
    criterion is invoked with the field value and must return something
    truthy; any other criterion must compare equal to the field value.

    Returns:
        True if all criteria hold (also when no criteria are given),
        otherwise False.
    """

    def satisfied(field, expected):
        actual = event.get(field)
        if callable(expected):
            return bool(expected(actual))
        if expected != actual:
            return False
        return True

    # all() stops at the first failing criterion, like an early return.
    return all(satisfied(field, expected) for field, expected in kwargs.items())
def _shorten(text, width):
    """Return ``text`` cut to ``width`` characters, ending in '...' if cut."""
    if len(text) > width:
        return text[:width - 3] + '...'
    return text


def _parse_event_time(value):
    """Parse an ISO 8601 timestamp string, accepting a trailing 'Z' for UTC."""
    # datetime.fromisoformat() only accepts the 'Z' suffix from Python 3.11 on.
    if value[-1:] in ('Z', 'z'):
        value = value[:-1] + '+00:00'
    return dt.fromisoformat(value)


def formatter(event):
    """Format an event for display and return it together with its raw fields.

    Args:
        event: Decoded event dict. Reads ``user``, ``title``, ``type`` and
            ``meta['dt']``; for events whose ``revision`` has both ``old`` and
            ``new`` it also reads ``server_url`` and ``server_script_path``
            (falling back to ``https://de.wikipedia.org`` and ``/w``),
            otherwise ``meta['uri']``.

    Returns:
        A tuple ``(text, fields)``. ``text`` is one aligned console line in
        which user and title are shortened to 15 and 30 characters.
        ``fields`` is ``(timestamp_iso, user, title, type, uri)`` with the
        full, unshortened user name and title.

    Raises:
        KeyError: If a required field is missing from the event.
        ValueError: If ``meta['dt']`` is not a valid ISO 8601 timestamp.
    """
    user = event['user']
    title = event['title']
    typ = event['type']
    timestamp = _parse_event_time(event['meta']['dt']).astimezone(TIMEZONE)

    revision = event.get('revision')
    if revision and 'old' in revision and 'new' in revision:
        # Link to the diff on the wiki the event came from; the filter may
        # admit servers other than de.wikipedia.org.
        server_url = event.get('server_url') or 'https://de.wikipedia.org'
        script_path = event.get('server_script_path')
        if script_path is None:
            script_path = '/w'
        old = revision['old']
        new = revision['new']
        uri = (
            f'{server_url}{script_path}/index.php?title={quote(title)}'
            f'&type=revision&diff={new}&oldid={old}'
        )
    else:
        uri = unquote(event['meta']['uri'])

    # Only the console line uses shortened values; the returned fields keep
    # the full user name and title.
    shown_user = _shorten(user, 15)
    shown_title = _shorten(title, 30).title()
    text = (
        f'{timestamp.time()!s} | {shown_user:<15} | {shown_title:<30} | '
        f'{typ:<{TYPES_LEN}} | {uri}'
    )
    return text, (timestamp.isoformat(), user, title, typ, uri)
async def main():
    """Main program.

    Prepares the filters, iterates over the event stream, prints every
    matching event to the console and appends it to ``wikispy.csv`` in the
    current working directory (semicolon-delimited, UTF-8). A header row is
    written when the file does not exist yet or is empty.
    """

    # Only servers whose name starts with 'de.'; events without a string
    # server name are rejected instead of raising AttributeError.
    def is_german_server(server_name):
        return isinstance(server_name, str) and server_name.startswith('de.')

    # Only users from the watch list.
    def is_watched_user(user):
        return user in USERS

    # Only these event types.
    wanted_types = ('edit', 'categorize', 'new')

    def is_wanted_type(typ):
        return typ in wanted_types

    # Options for the filter: plain values are compared for equality,
    # callables are called with the event's field value.
    options = {
        'bot': False,
        'server_name': is_german_server,
        'type': is_wanted_type,
        'user': is_watched_user,
    }

    csv_file = Path('wikispy.csv')
    write_header = not csv_file.exists() or csv_file.stat().st_size == 0

    # newline='' is what the csv module requires for files it writes to;
    # an explicit encoding keeps non-ASCII names independent of the platform.
    with open(csv_file, 'a', newline='', encoding='utf-8') as fd:
        writer = csv.writer(fd, delimiter=';')
        if write_header:
            writer.writerow(['timestamp', 'user', 'title', 'type', 'uri'])
        async for event in event_generator(STREAM_URL):
            if not selector(event, **options):
                continue
            text, fields = formatter(event)
            # Print the text, then write the fields to the file.
            print(text)
            writer.writerow(fields)
            # The stream never ends; flush so rows are on disk even if the
            # program is interrupted.
            fd.flush()
- # fields of an event
- # meta has a subdict with more information
- # ['bot', 'comment', 'id', 'length', 'meta', 'minor', 'namespace',
- # 'parsedcomment', 'patrolled', 'revision', 'server_name',
- # 'server_script_path', 'server_url', 'timestamp', 'title',
- # 'type', 'user', 'wiki']
- """
- The resulting output:
- 18:34:22 | JosFritz | Benutzer Diskussion:Andropov | edit | https://de.wikipedia.org/w/index.php?title=Benutzer%20Diskussion%3AAndropov&type=revision&diff=188219653&oldid=188219250
- 18:34:25 | Kurator71 | Benutzer:Kurator71/Erzurum ... | edit | https://de.wikipedia.org/w/index.php?title=Benutzer%3AKurator71/Erzurum%20Congress&type=revision&diff=188219655&oldid=188218480
- 18:43:10 | Phi | Ns-Staat | edit | https://de.wikipedia.org/w/index.php?title=NS-Staat&type=revision&diff=188219980&oldid=188216757
- 18:45:44 | Sänger | Benutzer Diskussion:Sänger | edit | https://de.wikipedia.org/w/index.php?title=Benutzer%20Diskussion%3AS%C3%A4nger&type=revision&diff=188220046&oldid=188219908
- 18:48:03 | Kurator71 | Benutzer:Kurator71/Erzurum ... | edit | https://de.wikipedia.org/w/index.php?title=Benutzer%3AKurator71/Erzurum%20Congress&type=revision&diff=188220107&oldid=188219655
- 18:48:03 | Kurator71 | Kategorie:Türkischer Befrei... | categorize | https://de.wikipedia.org/wiki/Kategorie:Türkischer_Befreiungskrieg
- 18:48:03 | Kurator71 | Kategorie:Türkische Militär... | categorize | https://de.wikipedia.org/wiki/Kategorie:Türkische_Militärgeschichte
- 18:48:03 | Kurator71 | Kategorie:Mustafa Kemal Ata... | categorize | https://de.wikipedia.org/wiki/Kategorie:Mustafa_Kemal_Atatürk
- 18:48:03 | Kurator71 | Kategorie:Erzurum | categorize | https://de.wikipedia.org/wiki/Kategorie:Erzurum
- 18:48:54 | Kurator71 | Benutzer:Kurator71/Erzurum ... | edit | https://de.wikipedia.org/w/index.php?title=Benutzer%3AKurator71/Erzurum%20Congress&type=revision&diff=188220131&oldid=188220107
- 18:50:11 | Partynia | Benutzer:Partynia | edit | https://de.wikipedia.org/w/index.php?title=Benutzer%3APartynia&type=revision&diff=188220167&oldid=188213600
- 18:51:26 | JosFritz | Benutzer Diskussion:Andropov | edit | https://de.wikipedia.org/w/index.php?title=Benutzer%20Diskussion%3AAndropov&type=revision&diff=188220201&oldid=188220159
- 18:51:53 | Kurator71 | Benutzer:Kurator71/Erzurum ... | edit | https://de.wikipedia.org/w/index.php?title=Benutzer%3AKurator71/Erzurum%20Congress&type=revision&diff=188220217&oldid=188220131
- 18:53:08 | Kurator71 | Erzurum-Kongress | edit | https://de.wikipedia.org/w/index.php?title=Erzurum-Kongress&type=revision&diff=188220253&oldid=188220224
- 18:53:32 | Kurator71 | Erzurum-Kongress | edit | https://de.wikipedia.org/w/index.php?title=Erzurum-Kongress&type=revision&diff=188220265&oldid=188220253
- 18:54:13 | Kurator71 | Benutzer:Kurator71/Erzurum ... | edit | https://de.wikipedia.org/w/index.php?title=Benutzer%3AKurator71/Erzurum%20Congress&type=revision&diff=188220283&oldid=188220225
- 18:54:13 | Kurator71 | Kategorie:Wikipedia:Schnell... | categorize | https://de.wikipedia.org/wiki/Kategorie:Wikipedia:Schnelllöschen
- 18:54:58 | Kurator71 | Wikipedia:Wikimedia Cee Spr... | edit | https://de.wikipedia.org/w/index.php?title=Wikipedia%3AWikimedia%20CEE%20Spring%202019&type=revision&diff=188220305&oldid=188217610
- 18:56:37 | Partynia | Benutzer:Partynia/Gesundhei... | edit | https://de.wikipedia.org/w/index.php?title=Benutzer%3APartynia/Gesundheitswesen%20Neu&type=revision&diff=188220351&oldid=187751173
- 18:56:43 | Phi | Diskussion:Sven Felix Kelle... | edit | https://de.wikipedia.org/w/index.php?title=Diskussion%3ASven%20Felix%20Kellerhoff&type=revision&diff=188220354&oldid=188210108
- 18:57:50 | Hardenacke | Günther Stempel | edit | https://de.wikipedia.org/w/index.php?title=G%C3%BCnther%20Stempel&type=revision&diff=188220383&oldid=188114758
- 18:57:50 | Phi | Diskussion:Sven Felix Kelle... | edit | https://de.wikipedia.org/w/index.php?title=Diskussion%3ASven%20Felix%20Kellerhoff&type=revision&diff=188220384&oldid=188220354
- 18:58:24 | Phi | Oxymoron | edit | https://de.wikipedia.org/w/index.php?title=Oxymoron&type=revision&diff=188220403&oldid=188220369
- 18:59:04 | -jkb- | Jan Líbezný | edit | https://de.wikipedia.org/w/index.php?title=Jan%20L%C3%ADbezn%C3%BD&type=revision&diff=188220423&oldid=187718616
- 18:59:04 | -jkb- | Kategorie:Wikipedia:Defekte... | categorize | https://de.wikipedia.org/wiki/Kategorie:Wikipedia:Defekte_Weblinks/Ungeprüfte_Archivlinks_2018-04
- 18:59:11 | Partynia | Benutzer:Partynia/Geschichte | edit | https://de.wikipedia.org/w/index.php?title=Benutzer%3APartynia/Geschichte&type=revision&diff=188220427&oldid=187217751
- 19:03:22 | -jkb- | Jüdische Gemeinde Loštice | edit | https://de.wikipedia.org/w/index.php?title=J%C3%BCdische%20Gemeinde%20Lo%C5%A1tice&type=revision&diff=188220528&oldid=187767523
- 19:03:22 | -jkb- | Kategorie:Wikipedia:Weblink... | categorize | https://de.wikipedia.org/wiki/Kategorie:Wikipedia:Weblink_offline
- 19:03:22 | -jkb- | Kategorie:Wikipedia:Weblink... | categorize | https://de.wikipedia.org/wiki/Kategorie:Wikipedia:Weblink_offline_IABot
- 19:03:22 | -jkb- | Kategorie:Wikipedia:Defekte... | categorize | https://de.wikipedia.org/wiki/Kategorie:Wikipedia:Defekte_Weblinks/Ungeprüfte_Botmarkierungen_2018-04
- 19:06:02 | JosFritz | Benutzerin Diskussion:Josfritz | edit | https://de.wikipedia.org/w/index.php?title=Benutzerin%20Diskussion%3AJosFritz&type=revision&diff=188220602&oldid=188211731
- 19:08:18 | Sänger | Benutzer Diskussion:Sänger | edit | https://de.wikipedia.org/w/index.php?title=Benutzer%20Diskussion%3AS%C3%A4nger&type=revision&diff=188220646&oldid=188220617
- 19:14:05 | Phi | Wikipedia:Review/Geschichte | edit | https://de.wikipedia.org/w/index.php?title=Wikipedia%3AReview/Geschichte&type=revision&diff=188220781&oldid=188220559
- 19:22:56 | Sänger | Benutzer Diskussion:Sänger | edit | https://de.wikipedia.org/w/index.php?title=Benutzer%20Diskussion%3AS%C3%A4nger&type=revision&diff=188220979&oldid=188220944
- 19:24:08 | Phi | Diskussion:Adolf Hitlers Mö... | edit | https://de.wikipedia.org/w/index.php?title=Diskussion%3AAdolf%20Hitlers%20m%C3%B6gliche%20Monorchie&type=revision&diff=188221011&oldid=188220836
- 19:25:29 | JosFritz | Benutzer Diskussion:Sänger | edit | https://de.wikipedia.org/w/index.php?title=Benutzer%20Diskussion%3AS%C3%A4nger&type=revision&diff=188221065&oldid=188221024
- 19:26:44 | Sänger | Wikipedia:Vandalismusmeldung | edit | https://de.wikipedia.org/w/index.php?title=Wikipedia%3AVandalismusmeldung&type=revision&diff=188221097&oldid=188220985
- 19:28:53 | Über-Blick | Christlich Demokratische Un... | edit | https://de.wikipedia.org/w/index.php?title=Christlich%20Demokratische%20Union%20Deutschlands&type=revision&diff=188221145&oldid=188173337
- """
if __name__ == '__main__':
    # Create the loop explicitly; asyncio.get_event_loop() is deprecated when
    # no loop is running.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    # ignore_aiohttp_ssl_eror takes the loop as its argument; it is not itself
    # a (loop, context) exception handler, so it must be called, not passed
    # to loop.set_exception_handler().
    ignore_aiohttp_ssl_eror(loop)
    try:
        # main() consumes an endless stream, so this call only ends through
        # an exception such as KeyboardInterrupt (Ctrl+C).
        loop.run_until_complete(main())
    except KeyboardInterrupt:
        pass
    finally:
        loop.close()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement