txtv

Swiss text tv in the terminal
git clone https://git.in0rdr.ch/txtv.git
Log | Files | Refs | Pull requests | Archive | README | LICENSE

commit e5c95969f293402302edf07efcc9e8be9a41453f
parent 4abcf6cb6b16daa7f7990c82e595b61d041b6ec9
Author: Andreas Gruhler <andreas.gruhler@adfinis.com>
Date:   Wed, 18 Oct 2023 02:01:20 +0200

feat: improve parsing

Diffstat:
Dtxtv/listing.py | 3---
Mtxtv/txtv.py | 115+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------
2 files changed, 101 insertions(+), 17 deletions(-)

diff --git a/txtv/listing.py b/txtv/listing.py @@ -1,3 +0,0 @@ -import re -from txtv.txtv import Page - diff --git a/txtv/txtv.py b/txtv/txtv.py @@ -1,6 +1,7 @@ import requests as rq import sys import re +import regex import colorama import readline import json @@ -17,6 +18,12 @@ class Page: self.num = num self.contententries = [] url = f'https://api.teletext.ch/channels/SRF1/pages/{num}' + self.haspages = True + # Either a news page (first format) or a TV page (second) + self.dateformat = r'(\s{2,}\d{2}\.\d{2}\.\d{2}\s\d{2}:\d{2})|(\d{2}.*\d{2}:\d{2}-\d{2}:\d{2})' + self.titleformat = r'(.*?\d{2}\.\d{2}\.\d{2}\s\d{2}:\d{2})|(.*\d{2}:\d{2}-\d{2}:\d{2})' + + try: res = rq.get(url) if res.status_code != 200: @@ -24,40 +31,120 @@ class Page: page = json.loads(res.content) self.content = page["subpages"][0]["ep1Info"]["contentText"] + # Leaf pages have the date on top + self.haspages = not re.search(self.dateformat, self.content) + if not self.has_pages(): + # Extract and remove title from content + split = re.split(self.titleformat, self.content) + split = list(filter(None, split)) + self.title = split[0] + self.content = split[1] return - # The subtitles are written uppercase and end with a double column - # There can be multiple subtitles on a page (subtitle, stories, subtitle, stories, etc.) - stories = self.content.split(": ") + # Keep the uppercase titles + stories = self.content - # Don't start with the title - for s in stories[1:]: - # This regex separates the stories by three digit page number - lines = re.findall(r'.*?\d{3}', s) - self.contententries += lines + if self.num == 100 or self.num == 700: + # Remove actual titles + stories = re.sub("Jetzt auf SRF 1", "", stories) + stories = re.sub("JETZT AUF SRF 1", "", stories) + stories = re.sub("TELETEXT SRF 1", "", stories) + else: + # Remove all uppercase subtitles. 
There can be multiple + # subtitles on a page (subtitle, stories, subtitle, stories, etc) + stories = regex.sub(r'[\p{Lu}\s-]{9,}[\s:]', '', self.content) + + # Find all three digit numbers, most probably these are page numbers + page_nrs = re.findall(r'\s(\d{3})*[-\/]*(\d{3})([^\d]|$)', stories) + all_page_nrs = [] + + for p in page_nrs: + try: + n = int(p[0]) + all_page_nrs.append(n) + except: + pass + try: + n = int(p[1]) + all_page_nrs.append(n) + except: + pass + + all_page_nrs = [str(p) for p in all_page_nrs] + + entries = [] + i = 0 + entry = "" + + # Split content by whitespaces + chunks = stories.split() + + # Include all chunks on overview/tv pages + if self.num != 100 and self.num != 700: + # Discard the title chunks (headings) + chunks = chunks[4:] + #all_page_nrs = all_page_nrs[2:] + + for chunk in chunks: + if i+1 >= len(all_page_nrs): + # Add all remaining text from that page to last chunk + entry += chunk + " " + continue + + # Add the chunk to the current entry + entry += chunk + " " + + # If the chunk is indeed in the list of next potential page numbers + if all_page_nrs[i] in chunk or chunk in all_page_nrs[i:]: + # Check for ascending page numbers + #if int(page_nrs[i+1]) > int(chunk): + # Add the entry to the list of all entries + entries.append(entry.strip()) + entry = "" + #else: + i += 1 + + # Add last entry with remaining text + entries.append(entry.strip()) + self.contententries = entries except rq.exceptions.RequestException: err(f"Could not get '{url}'.") def has_pages(self) -> bool: - # Leaf pages have the date on top - return not re.search(r'\s{2,}\d{2}\.\d{2}\.\d{2}\s\d{2}:\d{2}', self.content) + if self.num in [100, 104, 180, 130, 500, 150, 700]: + return True + else: + return self.haspages def show(self) -> str: """Prints the page contained by the specified tag in color.""" out = '' if not self.has_pages(): - return textwrap.fill(self.content, 72) + out = '\n' + Style.BRIGHT + textwrap.fill(self.title.strip(), 37) + '\n\n' + 
Style.RESET_ALL + out += textwrap.fill(self.content.strip(), 37) + return out articles = [] + append = "" for e in self.contententries: - articles += [parse_content_entry(e)] + parsed_entry = parse_content_entry(append + e) + if parsed_entry == None: + # No clear page number found, assume this belongs to next entry + append += e + else: + articles += [parsed_entry] + append = '' + #prev_nr = int(articles[0][1])-1 for art in articles: if art: title, page_nbr = art + # if int(prev_nr)+1 != int(page_nbr): + # print("wrong article order") + #prev_nr = page_nbr out += title.ljust(37, '.') + Fore.BLUE + str(page_nbr) + Fore.RESET + '\n' return out @@ -78,10 +165,10 @@ def list_all_articles() -> list: return full_listing def parse_content_entry(line: str) -> tuple: - m = re.fullmatch(r'(\* )?(.+[^.]).*[^0-9]([0-9]{3})[-f]?', line) + m = re.fullmatch(r'(.*)\s(\d{3}[-\/]*).*', line) if m: - return (m.group(2).strip(), m.group(3)) + return (m.group(1).strip(), m.group(2)) else: # raise RuntimeError(f'LINE DIDNT MATCH! {line}') return None