begin beautifulsoup processing, add support for non-photo posts, skeleton for photo/photoset/video processing

This commit is contained in:
Xevion
2020-05-04 06:33:13 -05:00
parent 7810a1930b
commit 5dadf77a11
3 changed files with 50 additions and 8 deletions

3
main.py Normal file
View File

@@ -0,0 +1,3 @@
from tumble.main import Blog
blog = Blog('galinadubpicss')

View File

@@ -1,3 +1,7 @@
docopt~=0.6.2
ratelimit~=2.2.1
setuptools~=46.1.3
bs4~=0.0.1
beautifulsoup4~=4.9.0
requests~=2.23.0
tumble~=0.0.1

View File

@@ -2,15 +2,23 @@
main.py
Contains classes for managing and downloading media from Tumblr
"""
import re
import bs4
import requests
from itertools import count
from typing import Optional
from .misc import pageQuery, mediaQuery
session = requests.Session()
from typing import List
class Blog:
"""
A Blog object assists with downloading media from a specific blog.
It holds very basic information for cycling through all
"""
- def __init__(self, blogid, download: bool = True, max_pages: int = 99999):
+ def __init__(self, blogid, download: bool = True, max_pages: int = -1):
"""
:param download: If true, begin downloading immediately following initialization.
:param max_pages: The maximum number of pages
@@ -31,22 +39,49 @@ class Blog:
Processes the entire Tumblr blog acquiring all Media URLs
:param require_download: The number of media endpoints the function will pass before downloading media early
"""
- for page in range(1, self.max_pages):
-     urls = self.getPage(page)
-     if urls:
+ # count up infinitely if a maximum page count is never offered
+ pages = count(start=1) if self.max_pages == -1 else range(1, self.max_pages)
+ for page in pages:
+     urls = self.getMedia(page)
+     if urls is not None:
+         print(urls)
else:
print(f'Last page processed ({page}).')
break
- def getPage(self, page: int) -> List[str]:
+ def getMedia(self, page: int) -> Optional[str]:
"""
Processes a Tumblr page on a blog, locating all media URLs.
:param page: The page index
:return: A list of URLs for pictures or videos found on the associated page
"""
data = session.get(self.pageURL(page)).text
soup = bs4.BeautifulSoup(data, 'lxml')
urls = []
posts = soup.find_all(class_=re.compile('post-\d+'))
if len(posts) == 0:
print('No posts found on page. Quitting.')
return None
else:
print(f'{len(posts)} posts found on page.')
for videoTag in soup.find_all(class_='video'):
pass
for photosetTag in soup.find_all(class_='photoset'):
pass
for photoTag in soup.find_all(class_='photo'):
pass
return urls
def pageURL(self, page) -> str:
"""
Returns the appropriate URL for a given page, for a given Tumblr blog