commit dfe240bcf2db7275b35de5a807d148c5008d3243
parent d0a2a55e7a3f664d1815d3259c2d243476264550
Author: Isak Lindhé <isak.e.lindhe@gmail.com>
Date: Wed, 2 Jan 2019 11:58:51 +0100
article header listing
Diffstat:
A | listing.py | | | 67 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
M | txtv.py | | | 52 | ++++++++++++++++++++++++++++++++++++++-------------- |
A | util.py | | | 7 | +++++++ |
3 files changed, 112 insertions(+), 14 deletions(-)
diff --git a/listing.py b/listing.py
@@ -0,0 +1,67 @@
+import bs4
+import re
+from txtv import get_page_loop, get_page
+from pprint import pprint
+
+
+def is_content_entry(tag: bs4.element.Tag):
+    """
+    Tells whether the tag looks like one entry of a content listing:
+    a span whose children are one or more strings followed by
+    a final 'a' tag.
+    """
+    nodes = list(tag.children)
+    if tag.name != 'span' or len(nodes) < 2:
+        return False
+    *texts, link = nodes
+    if not isinstance(link, bs4.element.Tag):
+        return False
+    if not all(isinstance(text, str) for text in texts):
+        return False
+    return link.name == 'a'
+
+
+def parse_content_entry(tag: bs4.element.Tag) -> tuple:
+    """
+    Splits a content entry into its title and page number.
+
+    The title is the text in front of the trailing link with the
+    dots at its end and the surrounding whitespace removed, the
+    number is the text of the link.
+    Returns (None, None) for tags that are not content entries.
+    """
+    if not is_content_entry(tag):
+        return None, None
+    # is_content_entry guarantees at least one string before the link.
+    *texts, link = tag.children
+    text = ''.join(texts)
+    # '.*' rather than '.+' so that one-character titles match as well.
+    match = re.search(r'^(.*[^.])\.*$', text)
+    # No match (e.g. nothing but dots): strip the dots by hand
+    # instead of failing on match.group.
+    title = match.group(1) if match else text.rstrip('.')
+    return title.strip(), link.get_text()
+
+
+def parse_content_listing(page: bs4.element.Tag) -> list:
+    """
+    Collects the (title, number) pair of every span on the page
+    that is a content entry, in the order find_all returns them.
+    """
+    entries = []
+    for candidate in page.find_all('span'):
+        if is_content_entry(candidate):
+            entries.append(parse_content_entry(candidate))
+    return entries
+
+def test_content_listing():
+    """
+    Fetches page 102, prints its parsed content listing and checks
+    that every entry is a (title, number) pair.
+    """
+    page = get_page(102)[0]
+    content = parse_content_listing(page)
+    pprint(content)
+    assert all(len(entry) == 2 for entry in content)
+
+
+def content_list() -> list:
+    """
+    Returns the spans of page 700 that have at least two children
+    and contain an 'a' tag.
+    """
+    page = get_page(700)[0]
+    return [
+        span for span in page.find_all('span')
+        if len(list(span.children)) >= 2 and span.find('a')
+    ]
+
+
+def list_all_articles():
+    """
+    Gathers the content entries of the page chains starting at
+    pages 101 and 104, where each chain is followed through the
+    page numbers matched by the 'Fler rubriker' pattern.
+    """
+    more_pattern = r'Fler rubriker ([0-9]{3})'
+    articles = []
+    for first_page in [101, 104]:
+        for page in get_page_loop(first_page, more_pattern):
+            articles.extend(parse_content_listing(page))
+    return articles
diff --git a/txtv.py b/txtv.py
@@ -5,17 +5,14 @@ import requests as rq
import colorama
from colorama import Fore, Back, Style
import sys
-
-LINEWIDTH = 38
-
-def err(txt: str):
- """Prints a red error message and quits with exit code 1."""
- print(Fore.RED + txt + Fore.RESET, file=sys.stderr)
- sys.exit(1)
+import re
+
+from util import err
def get_page_number() -> int:
- """Parses and input validates the page number argument, returns it as an int."""
+ """
+ Parses and input validates the page number argument,
+ returns it as an int.
+ """
if len(sys.argv) > 2:
err('Maybe we will support more arguments in the future, but not today.')
if len(sys.argv) == 1:
@@ -33,16 +30,34 @@ def get_page(num: int) -> list:
"""
Returns a list of the tags containing
the page and potential subpages (type: bs4.element.Tag)
- on the specified page number. For most pages this will be a list of one element.
+ on the specified page number.
+ For most pages this will be a list of one element.
"""
res = rq.get(f'https://www.svt.se/svttext/web/pages/{num}.html')
if res.status_code != 200:
- err(f'When i tried to get the page i just got HTTP status code {res.status_code}.')
+ err(f'Got HTTP status code {res.status_code}.')
soup = bs4.BeautifulSoup(res.content, 'html.parser')
subpages = soup.find_all('pre', class_='root')
return subpages
+def get_page_loop(start_num: int, pattern):
+    """
+    Follows a chain of pages and returns the first subpage of each.
+
+    Starting at start_num, the text of the latest page is searched
+    for pattern, whose first group must capture the number of
+    the next page. The chain ends when the pattern no longer matches
+    or when it points to a page that was already fetched.
+    """
+    visited = {int(start_num)}
+    pages = [get_page(start_num)[0]]
+    while True:
+        match = re.search(pattern, pages[-1].get_text())
+        if not match:
+            break
+        next_num = int(match.group(1))
+        # Stopping on any revisited page, not only on start_num, keeps
+        # a cycle that never returns to the start from looping forever.
+        if next_num in visited:
+            break
+        visited.add(next_num)
+        pages.append(get_page(next_num)[0])
+    return pages
+
+def test_page_loop():
+    """
+    Fetches the chain of pages starting at page 101, prints it
+    and checks that at least one page came back.
+    """
+    # get_page_loop requires the pattern that captures the next page number.
+    pages = get_page_loop(101, r'Fler rubriker ([0-9]{3})')
+    print(f'number of pages = {len(pages)}')
+    for p in pages:
+        print(p.get_text())
+    assert pages
+
+
def show_page(page: bs4.element.Tag):
"""Prints the page contained by the specified tag in color."""
for node in page:
@@ -58,10 +73,19 @@ def show_page(page: bs4.element.Tag):
style = Fore.BLUE
print(style + node.get_text() + Style.RESET_ALL, end='')
+def show_headers():
+    """
+    Prints the title of every listed article, padded with dots
+    to 38 characters, followed by its page number in blue.
+    """
+    # listing imports from txtv, so it is only imported when needed.
+    from listing import list_all_articles
+    for title, page_nbr in list_all_articles():
+        padded_title = title.ljust(38, '.')
+        colored_nbr = Fore.BLUE + str(page_nbr) + Fore.RESET
+        print(padded_title, colored_nbr)
+
if __name__ == '__main__':
    colorama.init()
-    page_nbr = get_page_number()
-    subpages = get_page(page_nbr)
-    for page in subpages:
-        show_page(page)
+    # Check the length first: without arguments sys.argv[1] does not
+    # exist, and get_page_number has its own branch for that case.
+    if len(sys.argv) > 1 and sys.argv[1] == 'head':
+        show_headers()
+    else:
+        page_nbr = get_page_number()
+        subpages = get_page(page_nbr)
+        for page in subpages:
+            show_page(page)
    colorama.deinit()
diff --git a/util.py b/util.py
@@ -0,0 +1,7 @@
+from colorama import Fore, Back, Style
+import sys
+
+def err(txt: str):
+    """Writes the message to stderr in red, then exits with status 1."""
+    red_message = ''.join((Fore.RED, txt, Fore.RESET))
+    print(red_message, file=sys.stderr)
+    raise SystemExit(1)