mirror of
https://github.com/Xevion/tumble.git
synced 2025-12-06 01:16:45 -06:00
Begin BeautifulSoup processing, add support for non-photo posts, and add a skeleton for photo/photoset/video processing
This commit is contained in:
3
main.py
Normal file
3
main.py
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
from tumble.main import Blog
|
||||||
|
|
||||||
|
blog = Blog('galinadubpicss')
|
||||||
@@ -1,3 +1,7 @@
|
|||||||
docopt~=0.6.2
|
docopt~=0.6.2
|
||||||
ratelimit~=2.2.1
|
ratelimit~=2.2.1
|
||||||
setuptools~=46.1.3
|
setuptools~=46.1.3
|
||||||
|
bs4~=0.0.1
|
||||||
|
beautifulsoup4~=4.9.0
|
||||||
|
requests~=2.23.0
|
||||||
|
tumble~=0.0.1
|
||||||
@@ -2,15 +2,23 @@
|
|||||||
main.py
|
main.py
|
||||||
Contains classes for managing and downloading media from Tumblr
|
Contains classes for managing and downloading media from Tumblr
|
||||||
"""
|
"""
|
||||||
|
import re
|
||||||
|
import bs4
|
||||||
|
import requests
|
||||||
|
|
||||||
|
from itertools import count
|
||||||
|
from typing import Optional
|
||||||
|
from .misc import pageQuery, mediaQuery
|
||||||
|
|
||||||
|
session = requests.Session()
|
||||||
|
|
||||||
from typing import List
|
|
||||||
|
|
||||||
class Blog:
|
class Blog:
|
||||||
"""
|
"""
|
||||||
A Blog object assists with downloading media from a specific blog.
|
A Blog object assists with downloading media from a specific blog.
|
||||||
It holds very basic information for cycling through all pages of the blog.
|
It holds very basic information for cycling through all pages of the blog.
|
||||||
"""
|
"""
|
||||||
def __init__(self, blogid, download: bool = True, max_pages: int = 99999):
|
def __init__(self, blogid, download: bool = True, max_pages: int = -1):
|
||||||
"""
|
"""
|
||||||
:param download: If true, begin downloading immediately following initialization.
|
:param download: If true, begin downloading immediately following initialization.
|
||||||
:param max_pages: The maximum number of pages
|
:param max_pages: The maximum number of pages
|
||||||
@@ -31,22 +39,49 @@ class Blog:
|
|||||||
Processes the entire Tumblr blog acquiring all Media URLs
|
Processes the entire Tumblr blog acquiring all Media URLs
|
||||||
:param require_download: The number of media endpoints the function will pass before downloading media early
|
:param require_download: The number of media endpoints the function will pass before downloading media early
|
||||||
"""
|
"""
|
||||||
for page in range(1, self.max_pages):
|
|
||||||
urls = self.getPage(page)
|
# count up infinitely if a maximum page count is never offered
|
||||||
if urls:
|
pages = count(start=1) if self.max_pages == -1 else range(1, self.max_pages)
|
||||||
|
|
||||||
|
for page in pages:
|
||||||
|
urls = self.getMedia(page)
|
||||||
|
if urls is not None:
|
||||||
|
print(urls)
|
||||||
else:
|
else:
|
||||||
print(f'Last page processed ({page}).')
|
print(f'Last page processed ({page}).')
|
||||||
break
|
break
|
||||||
|
|
||||||
def getPage(self, page: int) -> List[str]:
|
def getMedia(self, page: int) -> Optional[str]:
|
||||||
"""
|
"""
|
||||||
Processes a Tumblr page on a blog, locating all media URLs.
|
Processes a Tumblr page on a blog, locating all media URLs.
|
||||||
|
|
||||||
:param page: The page index
|
:param page: The page index
|
||||||
:return: A list of URLs for pictures or videos found on the associated page
|
:return: A list of URLs for pictures or videos found on the associated page
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
data = session.get(self.pageURL(page)).text
|
||||||
|
soup = bs4.BeautifulSoup(data, 'lxml')
|
||||||
|
|
||||||
|
urls = []
|
||||||
|
|
||||||
|
posts = soup.find_all(class_=re.compile('post-\d+'))
|
||||||
|
if len(posts) == 0:
|
||||||
|
print('No posts found on page. Quitting.')
|
||||||
|
return None
|
||||||
|
else:
|
||||||
|
print(f'{len(posts)} posts found on page.')
|
||||||
|
|
||||||
|
for videoTag in soup.find_all(class_='video'):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
for photosetTag in soup.find_all(class_='photoset'):
|
||||||
|
pass
|
||||||
|
|
||||||
|
for photoTag in soup.find_all(class_='photo'):
|
||||||
|
pass
|
||||||
|
|
||||||
|
return urls
|
||||||
|
|
||||||
def pageURL(self, page) -> str:
|
def pageURL(self, page) -> str:
|
||||||
"""
|
"""
|
||||||
Returns the appropriate URL for a given page, for a given Tumblr blog
|
Returns the appropriate URL for a given page, for a given Tumblr blog
|
||||||
|
|||||||
Reference in New Issue
Block a user