article header listing - txtv - Swiss text tv in the terminal

commit dfe240bcf2db7275b35de5a807d148c5008d3243
parent d0a2a55e7a3f664d1815d3259c2d243476264550
Author: Isak Lindhé <isak.e.lindhe@gmail.com>
Date:   Wed,  2 Jan 2019 11:58:51 +0100

article header listing

Diffstat:
A listing.py  | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M txtv.py  | 52 ++++++++++++++++++++++++++++++++++++++--------------
A util.py  | 7 +++++++

3 files changed, 112 insertions(+), 14 deletions(-)
diff --git a/listing.py b/listing.py
@@ -0,0 +1,67 @@
+import bs4
+import re
+from txtv import get_page_loop, get_page
+from pprint import pprint
+
+
+def is_content_entry(tag: bs4.element.Tag):
+    # children = [
+    #         c for c in tag.children
+    #         if not (isinstance(c, str) and re.match(r' +', c))
+    #         ]
+    children = list(tag.children)
+    return (
+            tag.name == 'span'
+            and len(children) >= 2
+            and isinstance(children[-1], bs4.element.Tag)
+            and all(isinstance(elem, str) for elem in children[:-1])
+            and children[-1].name == 'a'
+            )
+
+
+def parse_content_entry(tag: bs4.element.Tag) -> tuple:
+    # children = [
+    #         c for c in tag.children
+    #         if not (isinstance(c, str) and re.match(r' +', c))
+    #         ]
+    children = list(tag.children)
+    if is_content_entry(tag):
+        title = re.search(r'^(.+[^.])\.*$', ''.join(children[:-1])).group(1).strip()
+        num = children[-1].get_text()
+        return title, num
+    else:
+        return None, None
+
+
+def parse_content_listing(page: bs4.element.Tag) -> list:
+    return [
+            parse_content_entry(span)
+            for span in page.find_all('span')
+            if is_content_entry(span)
+            ]
+
+def test_content_listing():
+    from pprint import pprint
+    page = get_page(102)[0]
+    content = parse_content_listing(page)
+    pprint(content)
+    assert False
+
+
+def content_list() -> list:
+    import re
+    itempattern = r'(\w+)\.*(\d\d\d)'
+    page = get_page(700)[0]
+    spans = page.find_all('span')
+    spans = [s for s in spans if len(list(s.children)) >= 2 and s.find('a')]
+    return spans
+    # return [re.findall(itempattern, node.get_text()) for node in page]
+
+
+def list_all_articles():
+    full_listing = []
+    for nbr in [101, 104]:
+        pages = get_page_loop(nbr, r'Fler rubriker ([0-9]{3})')
+        for p in pages:
+            full_listing += parse_content_listing(p)
+    return full_listing
diff --git a/txtv.py b/txtv.py
@@ -5,17 +5,14 @@ import requests as rq
 import colorama
 from colorama import Fore, Back, Style
 import sys
-
-LINEWIDTH = 38
-
-def err(txt: str):
-    """Prints a red error message and quits with exit code 1."""
-    print(Fore.RED + txt + Fore.RESET, file=sys.stderr)
-    sys.exit(1)
+import re
 
 
 def get_page_number() -> int:
-    """Parses and input validates the page number argument, returns it as an int."""
+    """
+    Parses and input validates the page number argument,
+    returns it as an int.
+    """
     if len(sys.argv) > 2:
         err('Maybe we will support more arguments in the future, but not today.')
     if len(sys.argv) == 1:
@@ -33,16 +30,34 @@ def get_page(num: int) -> list:
     """
     Returns a list of the tags containing
     the page and potential subpages (type: bs4.element.Tag)
-    on the specified page number. For most pages this will be a list of one element.
+    on the specified page number.
+    For most pages this will be a list of one element.
     """
     res = rq.get(f'https://www.svt.se/svttext/web/pages/{num}.html')
     if res.status_code != 200:
-        err(f'When i tried to get the page i just got HTTP status code {res.status_code}.')
+        err(f'Got HTTP status code {res.status_code}.')
     soup = bs4.BeautifulSoup(res.content, 'html.parser')
     subpages = soup.find_all('pre', class_='root')
     return subpages
 
 
+def get_page_loop(start_num: int, pattern):
+    pages = [get_page(start_num)[0]]
+    while True:
+        match = re.search(pattern, pages[-1].get_text())
+        if not match or match.group(1) == str(start_num):
+            break
+        pages.append(get_page(int(match.group(1)))[0])
+    return pages
+
+def test_page_loop():
+    pages = get_page_loop(101)
+    print(f'number of pages = {len(pages)}')
+    for p in pages:
+        print(p.get_text())
+    assert False
+
+
 def show_page(page: bs4.element.Tag):
     """Prints the page contained by the specified tag in color.""" 
     for node in page:
@@ -58,10 +73,19 @@ def show_page(page: bs4.element.Tag):
             style = Fore.BLUE
         print(style + node.get_text() + Style.RESET_ALL, end='')
 
+def show_headers():
+    from listing import list_all_articles
+    articles = list_all_articles()
+    for title, page_nbr in articles:
+        print(title.ljust(38, '.'), Fore.BLUE + str(page_nbr) + Fore.RESET)
+
 if __name__ == '__main__':
     colorama.init()
-    page_nbr = get_page_number()
-    subpages = get_page(page_nbr)
-    for page in subpages:
-        show_page(page)
+    if sys.argv[1] == 'head':
+        show_headers()
+    else:
+        page_nbr = get_page_number()
+        subpages = get_page(page_nbr)
+        for page in subpages:
+            show_page(page)
     colorama.deinit()
diff --git a/util.py b/util.py
@@ -0,0 +1,7 @@
+from colorama import Fore, Back, Style
+import sys
+
+def err(txt: str):
+    """Prints a red error message and quits with exit code 1."""
+    print(Fore.RED + txt + Fore.RESET, file=sys.stderr)
+    sys.exit(1)

	txtv Swiss text tv in the terminal
	git clone https://git.in0rdr.ch/txtv.git
	Log \| Files \| Refs \| Pull requests \| Archive \| README \| LICENSE

A	listing.py	\|	67	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	txtv.py	\|	52	++++++++++++++++++++++++++++++++++++++--------------
A	util.py	\|	7	+++++++