diff --git a/main.py b/main.py
new file mode 100644
index 0000000..38efb2a
--- /dev/null
+++ b/main.py
@@ -0,0 +1,3 @@
+from tumble.main import Blog
+
+blog = Blog('galinadubpicss')
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 063bef7..b844c84 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,7 @@
 docopt~=0.6.2
 ratelimit~=2.2.1
-setuptools~=46.1.3
\ No newline at end of file
+setuptools~=46.1.3
+bs4~=0.0.1
+beautifulsoup4~=4.9.0
+requests~=2.23.0
+tumble~=0.0.1
\ No newline at end of file
diff --git a/tumble/main.py b/tumble/main.py
index 281fb9f..bf22689 100644
--- a/tumble/main.py
+++ b/tumble/main.py
@@ -2,15 +2,23 @@
 main.py
 Contains classes for managing and downloading media from Tumblr
 """
+import re
+import bs4
+import requests
+
+from itertools import count
+from typing import List, Optional
+from .misc import pageQuery, mediaQuery
+
+session = requests.Session()
-from typing import List
 
 
 class Blog:
     """
     A Blog object assists with downloading media from a specific blog.
     It holds very basic information for cycling through all
     """
-    def __init__(self, blogid, download: bool = True, max_pages: int = 99999):
+    def __init__(self, blogid, download: bool = True, max_pages: int = -1):
         """
         :param download: If true, begin downloading immediately following initialization.
         :param max_pages: The maximum number of pages
@@ -31,21 +39,48 @@ class Blog:
         Processes the entire Tumblr blog acquiring all Media URLs
         :param require_download: The number of media endpoints the function will pass before downloading media early
         """
-        for page in range(1, self.max_pages):
-            urls = self.getPage(page)
-            if urls:
+
+        # count up infinitely if a maximum page count is never offered
+        pages = count(start=1) if self.max_pages == -1 else range(1, self.max_pages + 1)
+
+        for page in pages:
+            urls = self.getMedia(page)
+            if urls is not None:
+                print(urls)
             else:
                 print(f'Last page processed ({page}).')
                 break
 
-    def getPage(self, page: int) -> List[str]:
+    def getMedia(self, page: int) -> Optional[List[str]]:
         """
        Processes a Tumblr page on a blog, locating all media URLs.
         :param page: The page index
         :return: A list of URLs for pictures or videos found on the associated page
         """
-        pass
+
+        data = session.get(self.pageURL(page)).text
+        soup = bs4.BeautifulSoup(data, 'lxml')
+
+        urls = []
+
+        posts = soup.find_all(class_=re.compile(r'post-\d+'))
+        if len(posts) == 0:
+            print('No posts found on page. Quitting.')
+            return None
+        else:
+            print(f'{len(posts)} posts found on page.')
+
+        for videoTag in soup.find_all(class_='video'):
+            pass
+
+        for photosetTag in soup.find_all(class_='photoset'):
+            pass
+
+        for photoTag in soup.find_all(class_='photo'):
+            pass
+
+        return urls
 
     def pageURL(self, page) -> str:
         """
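
The diff cuts off at the start of pageURL, so the page-address format is not shown. Below is a minimal standalone sketch of the paging flow that processBlog now sets up, assuming pageURL builds the standard https://<blogid>.tumblr.com/page/<n> address and treating max_pages == -1 as "no limit" via itertools.count; the URL scheme and the iter_pages helper are illustrative assumptions, not part of the PR.

from itertools import count
from typing import Iterator, Tuple

import requests


def page_url(blogid: str, page: int) -> str:
    # Assumed scheme; a blog on a custom domain would need a different base URL.
    return f'https://{blogid}.tumblr.com/page/{page}'


def iter_pages(blogid: str, max_pages: int = -1) -> Iterator[Tuple[int, str]]:
    """Yield (page number, raw HTML) until a request fails or the limit is hit."""
    session = requests.Session()
    # -1 mirrors the sentinel Blog.__init__ uses for "walk every page".
    pages = count(start=1) if max_pages == -1 else range(1, max_pages + 1)
    for page in pages:
        response = session.get(page_url(blogid, page))
        if response.status_code != 200:
            break
        yield page, response.text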
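
The three loops in getMedia are still stubs. One way they might be filled in, kept outside the class as a sketch: the 'video', 'photoset', and 'photo' class names come from the diff, but the tag and attribute lookups (iframe/source src, img src) are guesses about how a Tumblr theme marks up its media. The sketch also sticks to the stdlib html.parser, since getMedia requests lxml but lxml is not listed in requirements.txt.

import bs4


def extract_media_urls(soup: bs4.BeautifulSoup) -> list:
    """Collect candidate media URLs from a parsed Tumblr page."""
    urls = []

    # Video posts: pull the player or embed source.
    for video_tag in soup.find_all(class_='video'):
        for source in video_tag.find_all(['source', 'iframe']):
            src = source.get('src')
            if src:
                urls.append(src)

    # Photoset and single-photo posts: collect every image source.
    for photo_tag in soup.find_all(class_=['photoset', 'photo']):
        for img in photo_tag.find_all('img'):
            src = img.get('src')
            if src:
                urls.append(src)

    return urls

Driving both sketches together against the blog referenced in main.py:

for page, html in iter_pages('galinadubpicss', max_pages=2):
    soup = bs4.BeautifulSoup(html, 'html.parser')
    print(page, extract_media_urls(soup))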