Begin BeautifulSoup processing; add support for non-photo posts; add skeleton for photo/photoset/video processing

This commit is contained in:
Xevion
2020-05-04 06:33:13 -05:00
parent 7810a1930b
commit 5dadf77a11
3 changed files with 50 additions and 8 deletions

3
main.py Normal file
View File

@@ -0,0 +1,3 @@
# Example entry point: build a Blog for the 'galinadubpicss' Tumblr blog.
# NOTE(review): Blog.__init__ takes download=True by default, so this
# presumably starts crawling/downloading immediately on import — confirm.
from tumble.main import Blog
blog = Blog('galinadubpicss')

View File

@@ -1,3 +1,7 @@
docopt~=0.6.2
ratelimit~=2.2.1
setuptools~=46.1.3
setuptools~=46.1.3
bs4~=0.0.1
beautifulsoup4~=4.9.0
requests~=2.23.0
tumble~=0.0.1

View File

@@ -2,15 +2,23 @@
main.py
Contains classes for managing and downloading media from Tumblr
"""
import re
import bs4
import requests
from itertools import count
from typing import Optional
from .misc import pageQuery, mediaQuery
session = requests.Session()
from typing import List
class Blog:
"""
A Blog object assists with downloading media from a specific blog.
It holds very basic information for cycling through all
"""
def __init__(self, blogid, download: bool = True, max_pages: int = 99999):
def __init__(self, blogid, download: bool = True, max_pages: int = -1):
"""
:param download: If true, begin downloading immediately following initialization.
:param max_pages: The maximum number of pages
@@ -31,21 +39,48 @@ class Blog:
Processes the entire Tumblr blog acquiring all Media URLs
:param require_download: The number of media endpoints the function will pass before downloading media early
"""
for page in range(1, self.max_pages):
urls = self.getPage(page)
if urls:
# count up infinitely if a maximum page count is never offered
pages = count(start=1) if self.max_pages == -1 else range(1, self.max_pages)
for page in pages:
urls = self.getMedia(page)
if urls is not None:
print(urls)
else:
print(f'Last page processed ({page}).')
break
def getMedia(self, page: int) -> Optional[List[str]]:
    """
    Processes a Tumblr page on a blog, locating all media URLs.

    :param page: The page index (1-based).
    :return: A list of media URLs found on the page, or None when the page
        contains no posts (i.e. the end of the blog has been reached).
    """
    data = session.get(self.pageURL(page)).text
    soup = bs4.BeautifulSoup(data, 'lxml')
    urls: List[str] = []
    # Posts carry a class like 'post-123456789'; use a raw string so the
    # \d escape is a regex digit class, not an (invalid) string escape.
    posts = soup.find_all(class_=re.compile(r'post-\d+'))
    if len(posts) == 0:
        print('No posts found on page. Quitting.')
        return None
    print(f'{len(posts)} posts found on page.')
    # Skeleton: per-type media extraction is not implemented yet.
    for videoTag in soup.find_all(class_='video'):
        pass
    for photosetTag in soup.find_all(class_='photoset'):
        pass
    for photoTag in soup.find_all(class_='photo'):
        pass
    return urls
def pageURL(self, page) -> str:
"""