txtv

Swiss text tv in the terminal
git clone https://git.in0rdr.ch/txtv.git
Log | Files | Refs | Pull requests | Archive | README | LICENSE

commit dfe240bcf2db7275b35de5a807d148c5008d3243
parent d0a2a55e7a3f664d1815d3259c2d243476264550
Author: Isak Lindhé <isak.e.lindhe@gmail.com>
Date:   Wed,  2 Jan 2019 11:58:51 +0100

article header listing

Diffstat:
A listing.py | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M txtv.py | 52 ++++++++++++++++++++++++++++++++++++++--------------
A util.py | 7 +++++++
3 files changed, 112 insertions(+), 14 deletions(-)

diff --git a/listing.py b/listing.py @@ -0,0 +1,67 @@ +import bs4 +import re +from txtv import get_page_loop, get_page +from pprint import pprint + + +def is_content_entry(tag: bs4.element.Tag): + # children = [ + # c for c in tag.children + # if not (isinstance(c, str) and re.match(r' +', c)) + # ] + children = list(tag.children) + return ( + tag.name == 'span' + and len(children) >= 2 + and isinstance(children[-1], bs4.element.Tag) + and all(isinstance(elem, str) for elem in children[:-1]) + and children[-1].name == 'a' + ) + + +def parse_content_entry(tag: bs4.element.Tag) -> tuple: + # children = [ + # c for c in tag.children + # if not (isinstance(c, str) and re.match(r' +', c)) + # ] + children = list(tag.children) + if is_content_entry(tag): + title = re.search(r'^(.+[^.])\.*$', ''.join(children[:-1])).group(1).strip() + num = children[-1].get_text() + return title, num + else: + return None, None + + +def parse_content_listing(page: bs4.element.Tag) -> list: + return [ + parse_content_entry(span) + for span in page.find_all('span') + if is_content_entry(span) + ] + +def test_content_listing(): + from pprint import pprint + page = get_page(102)[0] + content = parse_content_listing(page) + pprint(content) + assert False + + +def content_list() -> list: + import re + itempattern = r'(\w+)\.*(\d\d\d)' + page = get_page(700)[0] + spans = page.find_all('span') + spans = [s for s in spans if len(list(s.children)) >= 2 and s.find('a')] + return spans + # return [re.findall(itempattern, node.get_text()) for node in page] + + +def list_all_articles(): + full_listing = [] + for nbr in [101, 104]: + pages = get_page_loop(nbr, r'Fler rubriker ([0-9]{3})') + for p in pages: + full_listing += parse_content_listing(p) + return full_listing diff --git a/txtv.py b/txtv.py @@ -5,17 +5,14 @@ import requests as rq import colorama from colorama import Fore, Back, Style import sys - -LINEWIDTH = 38 - -def err(txt: str): - """Prints a red error message and quits with exit code 
1.""" - print(Fore.RED + txt + Fore.RESET, file=sys.stderr) - sys.exit(1) +import re def get_page_number() -> int: - """Parses and input validates the page number argument, returns it as an int.""" + """ + Parses and input validates the page number argument, + returns it as an int. + """ if len(sys.argv) > 2: err('Maybe we will support more arguments in the future, but not today.') if len(sys.argv) == 1: @@ -33,16 +30,34 @@ def get_page(num: int) -> list: """ Returns a list of the tags containing the page and potential subpages (type: bs4.element.Tag) - on the specified page number. For most pages this will be a list of one element. + on the specified page number. + For most pages this will be a list of one element. """ res = rq.get(f'https://www.svt.se/svttext/web/pages/{num}.html') if res.status_code != 200: - err(f'When i tried to get the page i just got HTTP status code {res.status_code}.') + err(f'Got HTTP status code {res.status_code}.') soup = bs4.BeautifulSoup(res.content, 'html.parser') subpages = soup.find_all('pre', class_='root') return subpages +def get_page_loop(start_num: int, pattern): + pages = [get_page(start_num)[0]] + while True: + match = re.search(pattern, pages[-1].get_text()) + if not match or match.group(1) == str(start_num): + break + pages.append(get_page(int(match.group(1)))[0]) + return pages + +def test_page_loop(): + pages = get_page_loop(101) + print(f'number of pages = {len(pages)}') + for p in pages: + print(p.get_text()) + assert False + + def show_page(page: bs4.element.Tag): """Prints the page contained by the specified tag in color.""" for node in page: @@ -58,10 +73,19 @@ def show_page(page: bs4.element.Tag): style = Fore.BLUE print(style + node.get_text() + Style.RESET_ALL, end='') +def show_headers(): + from listing import list_all_articles + articles = list_all_articles() + for title, page_nbr in articles: + print(title.ljust(38, '.'), Fore.BLUE + str(page_nbr) + Fore.RESET) + if __name__ == '__main__': colorama.init() - 
page_nbr = get_page_number() - subpages = get_page(page_nbr) - for page in subpages: - show_page(page) + if sys.argv[1] == 'head': + show_headers() + else: + page_nbr = get_page_number() + subpages = get_page(page_nbr) + for page in subpages: + show_page(page) colorama.deinit() diff --git a/util.py b/util.py @@ -0,0 +1,7 @@ +from colorama import Fore, Back, Style +import sys + +def err(txt: str): + """Prints a red error message and quits with exit code 1.""" + print(Fore.RED + txt + Fore.RESET, file=sys.stderr) + sys.exit(1)