Begin BeautifulSoup processing; add support for non-photo posts; add skeleton for photo/photoset/video processing

This commit is contained in:
Xevion
2020-05-04 06:33:13 -05:00
parent 7810a1930b
commit 5dadf77a11
3 changed files with 50 additions and 8 deletions

3
main.py Normal file
View File

@@ -0,0 +1,3 @@
# Example entry point: build a Blog for the 'galinadubpicss' Tumblr blog.
# NOTE(review): Blog.__init__ takes download=True by default, so this
# presumably starts crawling/downloading immediately on import — confirm.
from tumble.main import Blog
blog = Blog('galinadubpicss')

View File

@@ -1,3 +1,7 @@
docopt~=0.6.2
ratelimit~=2.2.1
setuptools~=46.1.3
setuptools~=46.1.3
bs4~=0.0.1
beautifulsoup4~=4.9.0
requests~=2.23.0
tumble~=0.0.1

View File

@@ -2,15 +2,23 @@
main.py
Contains classes for managing and downloading media from Tumblr
"""
import re
import bs4
import requests
from itertools import count
from typing import Optional
from .misc import pageQuery, mediaQuery
session = requests.Session()
from typing import List
class Blog:
"""
A Blog object assists with downloading media from a specific blog.
It holds very basic information for cycling through all
"""
def __init__(self, blogid, download: bool = True, max_pages: int = 99999):
def __init__(self, blogid, download: bool = True, max_pages: int = -1):
"""
:param download: If true, begin downloading immediately following initialization.
:param max_pages: The maximum number of pages
@@ -31,21 +39,48 @@ class Blog:
Processes the entire Tumblr blog acquiring all Media URLs
:param require_download: The number of media endpoints the function will pass before downloading media early
"""
for page in range(1, self.max_pages):
urls = self.getPage(page)
if urls:
# count up infinitely if a maximum page count is never offered
pages = count(start=1) if self.max_pages == -1 else range(1, self.max_pages)
for page in pages:
urls = self.getMedia(page)
if urls is not None:
print(urls)
else:
print(f'Last page processed ({page}).')
break
def getMedia(self, page: int) -> Optional[List[str]]:
    """
    Processes a Tumblr page on a blog, locating all media URLs.

    :param page: The page index (1-based).
    :return: A list of media URLs found on the page, or None when the page
        contains no posts (i.e. the end of the blog has been reached).
    """
    data = session.get(self.pageURL(page)).text
    soup = bs4.BeautifulSoup(data, 'lxml')
    urls: List[str] = []
    # Posts carry a class like 'post-123456789'; use a raw string so the
    # \d escape is a regex digit class, not an (invalid) string escape.
    posts = soup.find_all(class_=re.compile(r'post-\d+'))
    if len(posts) == 0:
        print('No posts found on page. Quitting.')
        return None
    print(f'{len(posts)} posts found on page.')
    # Skeleton: per-type media extraction is not implemented yet.
    for videoTag in soup.find_all(class_='video'):
        pass
    for photosetTag in soup.find_all(class_='photoset'):
        pass
    for photoTag in soup.find_all(class_='photo'):
        pass
    return urls
def pageURL(self, page) -> str:
"""