begin beautifulsoup processing, add support for non-photo posts, skeleton for photo/photoset/video processing

This commit is contained in:
Xevion
2020-05-04 06:33:13 -05:00
parent 7810a1930b
commit 5dadf77a11
3 changed files with 50 additions and 8 deletions

3
main.py Normal file
View File

@@ -0,0 +1,3 @@
from tumble.main import Blog
blog = Blog('galinadubpicss')

View File

@@ -1,3 +1,7 @@
docopt~=0.6.2
ratelimit~=2.2.1
setuptools~=46.1.3
bs4~=0.0.1
beautifulsoup4~=4.9.0
requests~=2.23.0
tumble~=0.0.1

View File

@@ -2,15 +2,23 @@
main.py
Contains classes for managing and downloading media from Tumblr
"""
import re
import bs4
import requests
from itertools import count
from typing import Optional
from .misc import pageQuery, mediaQuery
session = requests.Session()
from typing import List
class Blog:
"""
A Blog object assists with downloading media from a specific blog.
It holds very basic information for cycling through all
"""
- def __init__(self, blogid, download: bool = True, max_pages: int = 99999):
+ def __init__(self, blogid, download: bool = True, max_pages: int = -1):
"""
:param download: If true, begin downloading immediately following initialization.
:param max_pages: The maximum number of pages
@@ -31,22 +39,49 @@ class Blog:
Processes the entire Tumblr blog acquiring all Media URLs
:param require_download: The number of media endpoints the function will pass before downloading media early
"""
- for page in range(1, self.max_pages):
-     urls = self.getPage(page)
-     if urls:
+ # count up infinitely if a maximum page count is never offered
+ pages = count(start=1) if self.max_pages == -1 else range(1, self.max_pages)
+ for page in pages:
+     urls = self.getMedia(page)
+     if urls is not None:
+         print(urls)
else:
print(f'Last page processed ({page}).')
break
- def getPage(self, page: int) -> List[str]:
+ def getMedia(self, page: int) -> Optional[str]:
"""
Processes a Tumblr page on a blog, locating all media URLs.
:param page: The page index
:return: A list of URLs for pictures or videos found on the associated page
"""
data = session.get(self.pageURL(page)).text
soup = bs4.BeautifulSoup(data, 'lxml')
urls = []
posts = soup.find_all(class_=re.compile('post-\d+'))
if len(posts) == 0:
print('No posts found on page. Quitting.')
return None
else:
print(f'{len(posts)} posts found on page.')
for videoTag in soup.find_all(class_='video'):
pass
for photosetTag in soup.find_all(class_='photoset'):
pass
for photoTag in soup.find_all(class_='photo'):
pass
return urls
def pageURL(self, page) -> str:
"""
Returns the appropriate URL for a given page, for a given Tumblr blog