Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python3
- """
- Specialized Html2Json converter.
- Only works with the given format.
- Html2Json reads from stdin by default.
- If stdin is a terminal, pass the input file
- with the -f option.
- """
- import json
- import sys
- from argparse import ArgumentParser
- from collections import deque
- from pathlib import Path
- from typing import Union
- from bs4 import BeautifulSoup
- example_html = """<table>
- <tbody>
- <tr></tr>
- <tr>
- <th>1 abc</th>
- <td>good</td>
- <td><a href="/good">John (Nick)</a></td>
- <td>Lincoln</td>
- </tr>
- <tr>
- <th>20 xyz</th>
- <td>bad</td>
- <td><a href="/bad">Emma</a></td>
- <td>Smith</td>
- </tr>
- <tr></tr>
- ...
- </tbody>
- </table>"""
def html2json(html: str, debug: bool = False) -> str:
    """
    Convert the rows of an HTML table to a JSON string.

    Every ``<tr>`` that has at least one direct ``<th>``/``<td>`` child
    becomes one object in the ``"collections"`` list. The first cell must
    hold exactly two whitespace-separated tokens, ``"<size> <identity>"``;
    the remaining cells are mapped, in order, to ``state``, ``name`` and
    ``nick``. Only ``size`` (as int), ``identity`` and ``name`` are emitted.

    :param html: HTML source containing the table rows.
    :param debug: If true, print the cell texts of each row to stderr.
    :return: JSON document ``{"collections": [...]}`` indented by 4 spaces.
    :raises ValueError: If the first cell of a row is not made of exactly
        two tokens or the first token is not an integer.
    """
    fields = ('size', 'identity', 'state', 'name', 'nick')
    skip_fields = {'state', 'nick'}
    records = []
    soup = BeautifulSoup(html, features='html.parser')
    for tr in soup.find_all('tr'):
        # Only direct <th>/<td> children are cells. Iterating tr.children
        # would also yield the whitespace strings between the tags; where
        # BeautifulSoup exposes .text on those strings they would be
        # mistaken for the first cell and break the unpacking below.
        cells = [
            cell.get_text()
            for cell in tr.find_all(['th', 'td'], recursive=False)
        ]
        if not cells:
            # Spacer rows such as <tr></tr> carry no data.
            continue
        if debug:
            print(cells, file=sys.stderr)
        header, *rest = cells
        size, identity = header.strip().split()
        # Keep size before identity so the values line up with `fields`;
        # deque.extendleft((size, identity)) inserts them in reversed order.
        values = [int(size), identity, *rest]
        records.append({
            field: value
            for field, value in zip(fields, values)
            if field not in skip_fields
        })
    return json.dumps({'collections': records}, indent=4)
def main(
    inputfile: Union[None, Path],
    debug: bool, *,
    example: bool
) -> str:
    """
    Produce the json string for the selected html source.

    With example set, the built-in example_html is converted.
    Otherwise the inputfile is read only when it is given and
    sys.stdin is a terminal; in every other case the html is
    read from sys.stdin until it is closed.
    """
    if example:
        return html2json(example_html, debug)

    # A piped stdin wins over the file; a missing file also falls
    # back to stdin.
    read_from_file = bool(inputfile) and sys.stdin.isatty()
    if read_from_file:
        html_source = inputfile.read_text()
    else:
        html_source = sys.stdin.read()
    return html2json(html_source, debug)
if __name__ == '__main__':
    # Command line entry point: parse the options, convert, print the json.
    cli = ArgumentParser(description=__doc__)
    cli.add_argument(
        '-f',
        dest='inputfile',
        default=None,
        type=Path,
        help='A path to the inputfile, if stdin is not used.',
    )
    cli.add_argument(
        '-d',
        dest='debug',
        action='store_true',
        help='Debug',
    )
    cli.add_argument(
        '-e',
        dest='example',
        action='store_true',
        help='Example with example data',
    )
    options = cli.parse_args()
    print(main(options.inputfile, options.debug, example=options.example))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement