txtv

Swiss text tv in the terminal
git clone https://git.in0rdr.ch/txtv.git
Log | Files | Refs | Pull requests | Archive | README | LICENSE

commit 4abcf6cb6b16daa7f7990c82e595b61d041b6ec9
parent 8480e693c60efba759c40c58683c4be6cffaa7de
Author: Andreas Gruhler <andreas.gruhler@adfinis.com>
Date:   Mon, 16 Oct 2023 01:27:07 +0200

feat: parse srf entries

Diffstat:
M txtv/listing.py | 69 ---------------------------------------------------------------------
M txtv/txtv.py | 81 +++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------------
2 files changed, 54 insertions(+), 96 deletions(-)

diff --git a/txtv/listing.py b/txtv/listing.py @@ -1,72 +1,3 @@ -import bs4 import re from txtv.txtv import Page -from pprint import pprint - - -def get_page_loop(start_num: int, pattern: str) -> list: - pages = [Page(start_num)] - while True: - match = re.search(pattern, pages[-1].subpages[0].get_text()) - if not match or match.group(1) == str(start_num): - break - pages.append(Page(int(match.group(1)))) - return pages - - -def is_content_entry(tag: bs4.element.Tag) -> bool: - # children = [ - # c for c in tag.children - # if not (isinstance(c, str) and re.match(r' +', c)) - # ] - # children = list(tag.children) - # return ( - # tag.name == 'span' - # and 'W' in tag.attrs.['class'] - # and len(children) >= 2 - # and isinstance(children[-1], bs4.element.Tag) - # and all(isinstance(elem, str) for elem in children[:-1]) - # and children[-1].name == 'a' - # ) - return ( - isinstance(tag, bs4.element.Tag) - and tag.name == 'span' - and all(not cls.startswith('bg') for cls in tag.attrs['class']) - and any((c in tag.attrs['class']) for c in ['W', 'C']) - and not re.fullmatch(' *', tag.get_text()) - ) - - -def parse_content_listing(page: Page) -> list: - raw = '' - for n in page.subpages[0].children: - if isinstance(n, str): - raw += n - pass - elif isinstance(n, bs4.element.Tag): - if 'class' not in n.attrs or all((x not in n.attrs['class']) for x in ['bgB', 'bgY', 'Y']): - raw += n.get_text() - entries = raw.splitlines() - entries = [e for e in entries if not re.fullmatch(' *', e)] - entries = [parse_content_entry(e) for e in entries] - return entries - - -def parse_content_entry(line: str) -> tuple: - m = re.fullmatch(r'(\* )?(.+[^.]).*[^0-9]([0-9]{3})[-f]?', line) - - if m: - return (m.group(2).strip(), m.group(3)) - else: - # raise RuntimeError(f'LINE DIDNT MATCH! 
{line}') - return None - - -def list_all_articles() -> list: - full_listing = [] - for nbr in [101, 104]: - pages = get_page_loop(nbr, r'Fler rubriker ([0-9]{3})') - for p in pages: - full_listing += parse_content_listing(p) - return full_listing diff --git a/txtv/txtv.py b/txtv/txtv.py @@ -1,10 +1,10 @@ -import bs4 import requests as rq import sys import re import colorama import readline import json +import textwrap from colorama import Fore, Back, Style from pathlib import Path from txtv.util import err @@ -15,41 +15,51 @@ cfg = get_config() class Page: def __init__(self, num: int): self.num = num + self.contententries = [] url = f'https://api.teletext.ch/channels/SRF1/pages/{num}' try: res = rq.get(url) if res.status_code != 200: err(f'Got HTTP status code {res.status_code}.') - soup = bs4.BeautifulSoup(res.content, 'html.parser') - self.subpages = soup.find_all('div', class_='data') + page = json.loads(res.content) + self.content = page["subpages"][0]["ep1Info"]["contentText"] + + if not self.has_pages(): + return + + # The subtitles are written uppercase and end with a double column + # There can be multiple subtitles on a page (subtitle, stories, subtitle, stories, etc.) 
+ stories = self.content.split(": ") + + # Don't start with the title + for s in stories[1:]: + # This regex separates the stories by three digit page number + lines = re.findall(r'.*?\d{3}', s) + self.contententries += lines + except rq.exceptions.RequestException: err(f"Could not get '{url}'.") - def show(self, subpages=None) -> str: + def has_pages(self) -> bool: + # Leaf pages have the date on top + return not re.search(r'\s{2,}\d{2}\.\d{2}\.\d{2}\s\d{2}:\d{2}', self.content) + + def show(self) -> str: """Prints the page contained by the specified tag in color.""" out = '' - for page in subpages or self.subpages: - pagetext: str = page.get_text() - pagejson = json.loads(pagetext) - content = pagejson["subpages"][0]["ep1Info"]["contentText"] - content = content.replace('\t', '') - lines = content.splitlines() - filtered = '' - for idx, line in enumerate(lines): - if idx == 0 and not cfg.getboolean('show', 'svt_header'): - pass - elif idx == 1 \ - and 'PUBLICERAD' in line \ - and not cfg.getboolean('show', 'publicerad_header'): - pass - elif idx == len(lines) - 1 \ - and re.match(r'.* [0-9]{3} +.* [0-9]{3} +.* [0-9]{3}', line) \ - and not cfg.getboolean('show', 'navigation_footer'): - pass - else: - filtered += line.rstrip() + '\n' - out += filtered - out = out.strip() + + if not self.has_pages(): + return textwrap.fill(self.content, 72) + + articles = [] + for e in self.contententries: + articles += [parse_content_entry(e)] + + for art in articles: + if art: + title, page_nbr = art + out += title.ljust(37, '.') + Fore.BLUE + str(page_nbr) + Fore.RESET + '\n' + return out def next_page(self): @@ -58,6 +68,24 @@ class Page: def prev_page(self): return Page(self.num - 1) +def list_all_articles() -> list: + full_listing = [] + for nbr in [104, 130]: + page = Page(nbr) + if not page.has_pages(): + continue + full_listing += [parse_content_entry(e) for e in page.contententries] + return full_listing + +def parse_content_entry(line: str) -> tuple: + m = 
re.fullmatch(r'(\* )?(.+[^.]).*[^0-9]([0-9]{3})[-f]?', line) + + if m: + return (m.group(2).strip(), m.group(3)) + else: + # raise RuntimeError(f'LINE DIDNT MATCH! {line}') + return None + def validate_page_nbr(arg: str) -> int: """ Validates a page number, returns as int. Raises ValueError if bad. @@ -140,7 +168,6 @@ def cmd_prev(state: dict, **kwargs) -> str: def cmd_list(**kwargs) -> str: - from txtv.listing import list_all_articles out = '' articles = list_all_articles() for art in articles: