mirror of
https://github.com/Xevion/tumble.git
synced 2025-12-05 23:16:45 -06:00
begin beautifulsoup processing, add support for non-photo posts, skeleton for photo/photoset/video processing
This commit is contained in:
3
main.py
Normal file
3
main.py
Normal file
@@ -0,0 +1,3 @@
|
||||
from tumble.main import Blog
|
||||
|
||||
blog = Blog('galinadubpicss')
|
||||
@@ -1,3 +1,7 @@
|
||||
docopt~=0.6.2
|
||||
ratelimit~=2.2.1
|
||||
setuptools~=46.1.3
|
||||
bs4~=0.0.1
|
||||
beautifulsoup4~=4.9.0
|
||||
requests~=2.23.0
|
||||
tumble~=0.0.1
|
||||
@@ -2,15 +2,23 @@
|
||||
main.py
|
||||
Contains classes for managing and downloading media from Tumblr
|
||||
"""
|
||||
import re
|
||||
import bs4
|
||||
import requests
|
||||
|
||||
from itertools import count
|
||||
from typing import Optional
|
||||
from .misc import pageQuery, mediaQuery
|
||||
|
||||
session = requests.Session()
|
||||
|
||||
from typing import List
|
||||
|
||||
class Blog:
|
||||
"""
|
||||
A Blog object assists with downloading media from a specific blog.
|
||||
It holds very basic information for cycling through all
|
||||
"""
|
||||
def __init__(self, blogid, download: bool = True, max_pages: int = 99999):
|
||||
def __init__(self, blogid, download: bool = True, max_pages: int = -1):
|
||||
"""
|
||||
:param download: If true, begin downloading immediately following initialization.
|
||||
:param max_pages: The maximum number of pages
|
||||
@@ -31,21 +39,48 @@ class Blog:
|
||||
Processes the entire Tumblr blog acquiring all Media URLs
|
||||
:param require_download: The number of media endpoints the function will pass before downloading media early
|
||||
"""
|
||||
for page in range(1, self.max_pages):
|
||||
urls = self.getPage(page)
|
||||
if urls:
|
||||
|
||||
# count up infinitely if a maximum page count is never offered
|
||||
pages = count(start=1) if self.max_pages == -1 else range(1, self.max_pages)
|
||||
|
||||
for page in pages:
|
||||
urls = self.getMedia(page)
|
||||
if urls is not None:
|
||||
print(urls)
|
||||
else:
|
||||
print(f'Last page processed ({page}).')
|
||||
break
|
||||
|
||||
def getPage(self, page: int) -> List[str]:
|
||||
def getMedia(self, page: int) -> Optional[list]:
    """
    Processes a Tumblr page on a blog, locating all media URLs.

    :param page: The page index
    :return: A list of URLs for pictures or videos found on the associated page, or None when the page
        contains no posts (the page-processing loop treats None as the end of the blog).
    """
    data = session.get(self.pageURL(page)).text
    soup = bs4.BeautifulSoup(data, 'lxml')

    urls = []

    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    posts = soup.find_all(class_=re.compile(r'post-\d+'))
    if not posts:
        print('No posts found on page. Quitting.')
        return None
    print(f'{len(posts)} posts found on page.')

    # Skeleton only: URL extraction for each media type is not implemented yet,
    # so `urls` is currently always returned empty for a page that has posts.
    for videoTag in soup.find_all(class_='video'):
        pass

    for photosetTag in soup.find_all(class_='photoset'):
        pass

    for photoTag in soup.find_all(class_='photo'):
        pass

    return urls
|
||||
|
||||
def pageURL(self, page) -> str:
|
||||
"""
|
||||
|
||||
Reference in New Issue
Block a user