commit 4abcf6cb6b16daa7f7990c82e595b61d041b6ec9
parent 8480e693c60efba759c40c58683c4be6cffaa7de
Author: Andreas Gruhler <andreas.gruhler@adfinis.com>
Date: Mon, 16 Oct 2023 01:27:07 +0200
feat: parse srf entries
Diffstat:
M | txtv/listing.py | | | 69 | --------------------------------------------------------------------- |
M | txtv/txtv.py | | | 81 | +++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------------- |
2 files changed, 54 insertions(+), 96 deletions(-)
diff --git a/txtv/listing.py b/txtv/listing.py
@@ -1,72 +1,3 @@
-import bs4
import re
from txtv.txtv import Page
-from pprint import pprint
-
-
-def get_page_loop(start_num: int, pattern: str) -> list:
- pages = [Page(start_num)]
- while True:
- match = re.search(pattern, pages[-1].subpages[0].get_text())
- if not match or match.group(1) == str(start_num):
- break
- pages.append(Page(int(match.group(1))))
- return pages
-
-
-def is_content_entry(tag: bs4.element.Tag) -> bool:
- # children = [
- # c for c in tag.children
- # if not (isinstance(c, str) and re.match(r' +', c))
- # ]
- # children = list(tag.children)
- # return (
- # tag.name == 'span'
- # and 'W' in tag.attrs.['class']
- # and len(children) >= 2
- # and isinstance(children[-1], bs4.element.Tag)
- # and all(isinstance(elem, str) for elem in children[:-1])
- # and children[-1].name == 'a'
- # )
- return (
- isinstance(tag, bs4.element.Tag)
- and tag.name == 'span'
- and all(not cls.startswith('bg') for cls in tag.attrs['class'])
- and any((c in tag.attrs['class']) for c in ['W', 'C'])
- and not re.fullmatch(' *', tag.get_text())
- )
-
-
-def parse_content_listing(page: Page) -> list:
- raw = ''
- for n in page.subpages[0].children:
- if isinstance(n, str):
- raw += n
- pass
- elif isinstance(n, bs4.element.Tag):
- if 'class' not in n.attrs or all((x not in n.attrs['class']) for x in ['bgB', 'bgY', 'Y']):
- raw += n.get_text()
- entries = raw.splitlines()
- entries = [e for e in entries if not re.fullmatch(' *', e)]
- entries = [parse_content_entry(e) for e in entries]
- return entries
-
-
-def parse_content_entry(line: str) -> tuple:
- m = re.fullmatch(r'(\* )?(.+[^.]).*[^0-9]([0-9]{3})[-f]?', line)
-
- if m:
- return (m.group(2).strip(), m.group(3))
- else:
- # raise RuntimeError(f'LINE DIDNT MATCH! {line}')
- return None
-
-
-def list_all_articles() -> list:
- full_listing = []
- for nbr in [101, 104]:
- pages = get_page_loop(nbr, r'Fler rubriker ([0-9]{3})')
- for p in pages:
- full_listing += parse_content_listing(p)
- return full_listing
diff --git a/txtv/txtv.py b/txtv/txtv.py
@@ -1,10 +1,10 @@
-import bs4
import requests as rq
import sys
import re
import colorama
import readline
import json
+import textwrap
from colorama import Fore, Back, Style
from pathlib import Path
from txtv.util import err
@@ -15,41 +15,51 @@ cfg = get_config()
class Page:
def __init__(self, num: int):
self.num = num
+ self.contententries = []
url = f'https://api.teletext.ch/channels/SRF1/pages/{num}'
try:
res = rq.get(url)
if res.status_code != 200:
err(f'Got HTTP status code {res.status_code}.')
- soup = bs4.BeautifulSoup(res.content, 'html.parser')
- self.subpages = soup.find_all('div', class_='data')
+ page = json.loads(res.content)
+ self.content = page["subpages"][0]["ep1Info"]["contentText"]
+
+ if not self.has_pages():
+ return
+
+ # The subtitles are written uppercase and end with a colon
+ # There can be multiple subtitles on a page (subtitle, stories, subtitle, stories, etc.)
+ stories = self.content.split(": ")
+
+ # Don't start with the title
+ for s in stories[1:]:
+ # This regex splits the text into entries, each ending with a three-digit page number
+ lines = re.findall(r'.*?\d{3}', s)
+ self.contententries += lines
+
except rq.exceptions.RequestException:
err(f"Could not get '{url}'.")
- def show(self, subpages=None) -> str:
+ def has_pages(self) -> bool:
+ # Leaf pages have the date on top
+ return not re.search(r'\s{2,}\d{2}\.\d{2}\.\d{2}\s\d{2}:\d{2}', self.content)
+
+ def show(self) -> str:
"""Prints the page contained by the specified tag in color."""
out = ''
- for page in subpages or self.subpages:
- pagetext: str = page.get_text()
- pagejson = json.loads(pagetext)
- content = pagejson["subpages"][0]["ep1Info"]["contentText"]
- content = content.replace('\t', '')
- lines = content.splitlines()
- filtered = ''
- for idx, line in enumerate(lines):
- if idx == 0 and not cfg.getboolean('show', 'svt_header'):
- pass
- elif idx == 1 \
- and 'PUBLICERAD' in line \
- and not cfg.getboolean('show', 'publicerad_header'):
- pass
- elif idx == len(lines) - 1 \
- and re.match(r'.* [0-9]{3} +.* [0-9]{3} +.* [0-9]{3}', line) \
- and not cfg.getboolean('show', 'navigation_footer'):
- pass
- else:
- filtered += line.rstrip() + '\n'
- out += filtered
- out = out.strip()
+
+ if not self.has_pages():
+ return textwrap.fill(self.content, 72)
+
+ articles = []
+ for e in self.contententries:
+ articles += [parse_content_entry(e)]
+
+ for art in articles:
+ if art:
+ title, page_nbr = art
+ out += title.ljust(37, '.') + Fore.BLUE + str(page_nbr) + Fore.RESET + '\n'
+
return out
def next_page(self):
@@ -58,6 +68,24 @@ class Page:
def prev_page(self):
return Page(self.num - 1)
+def list_all_articles() -> list:
+ full_listing = []
+ for nbr in [104, 130]:
+ page = Page(nbr)
+ if not page.has_pages():
+ continue
+ full_listing += [parse_content_entry(e) for e in page.contententries]
+ return full_listing
+
+def parse_content_entry(line: str) -> tuple:
+ m = re.fullmatch(r'(\* )?(.+[^.]).*[^0-9]([0-9]{3})[-f]?', line)
+
+ if m:
+ return (m.group(2).strip(), m.group(3))
+ else:
+ # raise RuntimeError(f'LINE DIDNT MATCH! {line}')
+ return None
+
def validate_page_nbr(arg: str) -> int:
"""
Validates a page number, returns as int. Raises ValueError if bad.
@@ -140,7 +168,6 @@ def cmd_prev(state: dict, **kwargs) -> str:
def cmd_list(**kwargs) -> str:
- from txtv.listing import list_all_articles
out = ''
articles = list_all_articles()
for art in articles: