Mirror of https://github.com/Xevion/exercism.git, synced 2025-12-06 23:14:56 -06:00
track comment scraping system created, rest_api exercise tweaked
87
track_comments.py
Normal file
@@ -0,0 +1,87 @@
import requests
import pprint
import datetime
import bs4
import re
import os
import sys
import time

# A small helper around requests.get(): raises on any non-200 response and
# records the timing and size of every page downloaded.
def simpleReq(url):
    start = time.time()
    data = requests.get(url)
    if data.status_code != 200:
        raise ConnectionError(f'A status code other than 200 was received. ({data.status_code} @ {url})')
    end = time.time()
    # Record (elapsed seconds, response size in UTF-16 bytes) for the summary at the end.
    request_timings.append((end - start, len(data.text.encode('utf-16-le'))))
    return data.text
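
# Illustrative usage (this exact profile URL is built below from `username`):
#   html = simpleReq('https://exercism.io/profiles/Xevion')
# Each call returns the page HTML and appends one (seconds, bytes) tuple to request_timings.
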
def parseComment(name, url, soup):
    # Find the reflection block on the solution page; not every solution has one.
    comment = soup.find(get_reflection)
    if comment is not None and comment['class'] == ['reflection']:
        return descend(comment)[3].text
    return ''

# This script works on the assumption that you have a setup similar to mine:
# a GitHub repository linked so that my Exercism progress is kept in one place across computers.
# It may break if a track folder is missing (i.e. you have started the `java` track but have no matching folder on disk).
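
# The directory layout this script expects, inferred from the paths built below
# (the track and exercise names here are only examples):
#
#   <repo root>/
#       track_comments.py
#       python/
#           COMMENTS.md        <- generated by this script
#           rest-api/
#               rest_api.py
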
# Constants & Reused Lambdas
username = 'Xevion'  # CaSe SeNsItIvE username from `Exercism.io`.
t1 = time.time()
request_timings = []
descend = lambda thing: list(thing.children)
get_solutions = lambda tag: ['solution'] == tag['class'] if tag.has_attr('class') else False
get_reflection = lambda tag: any('reflection' in classtag for classtag in tag['class']) if tag.has_attr('class') else False
get_url = lambda url: re.findall(r'exercism.io\/tracks\/([a-z-]+)\/exercises\/', url)[0]
get_name_from_url = lambda url: re.search(r'exercism.io\/tracks\/[a-z-]+\/exercises\/([a-z-]+)\/solutions\/', url).group(1)
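
# What the two URL lambdas pull out of a solution link (URL is illustrative):
#   url = 'https://exercism.io/tracks/python/exercises/rest-api/solutions/0123abcd'
#   get_url(url)           -> 'python'   (the track slug)
#   get_name_from_url(url) -> 'rest-api' (the exercise slug)
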
pp = pprint.PrettyPrinter()

print('Requesting Profile Page Data')
data = simpleReq('https://exercism.io/profiles/{}'.format(username))
soup = bs4.BeautifulSoup(data, 'html.parser')

comments_top = """# {0} Track Comments\n\nThis page represents all my comments from my solutions currently hosted on [Exercism.io](https://exercism.io/). You can view my profile [here](https://exercism.io/profiles/Xevion).
The reason for this is simply to have a place where I can collect my comments, as well as to have some fun with Python and web scraping. Exercise file and exercise submission links are provided for each and every exercise.
This file is for the **{0}** track, contains **{1}** submissions, **{2}** of which have comments. This file was built {3}.\n\n"""

# Find all completed exercises, extract the URL of the user's solution for each,
# and build a dictionary keyed by every track encountered.
solutions = soup.find_all(get_solutions)
# Each entry becomes an (exercise name, absolute solution URL) pair.
solutions = [(descend(descend(solution)[2])[1].text, 'https://exercism.io{}'.format(descend(solution.parent)[1]['href'])) for solution in solutions]
tracks = {get_url(solution[1]): [] for solution in solutions}

# Get all comment data & parse it; the results are sorted into the track dictionary below.
print('Requesting Page Data for {} solution{} from {} {}'.format(len(solutions), 's' if len(solutions) != 1 else '', len(tracks), 'different tracks' if len(tracks) != 1 else 'track'))
solutions = [(solution[0], solution[1], bs4.BeautifulSoup(simpleReq(solution[1]), 'html.parser')) for solution in solutions]
solutions = [{'name': solution[0], 'url': solution[1], 'comment': parseComment(*solution)} for solution in solutions]

# Send all the solutions to their appropriate tracks.
for solution in solutions:
    track = get_url(solution['url'])
    tracks[track].append(solution)
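
# At this point `tracks` maps each track slug to a list of solution dicts, roughly:
#   {'python': [{'name': 'REST API',
#                'url': 'https://exercism.io/tracks/python/exercises/rest-api/solutions/...',
#                'comment': '...'}, ...]}
# (the names and URL shown are illustrative, not real data).
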
# Render each track's comments into a readable markdown file.
print('Parsing all solution comments')
for track in tracks.keys():
    # Get the output path and format the top portion of the markdown file.
    path = os.path.join(sys.path[0], track, 'COMMENTS.md')
    submission_comments = len(list(filter(lambda item: item['comment'] != '', tracks[track])))
    top = comments_top.format(track.title(), len(tracks[track]), submission_comments, datetime.datetime.utcnow().strftime('on **%d-%m-%Y** at **%H:%M:%S UTC**'))

    # Add all the comments with proper formatting and links.
    markdown_comments = []
    for submission in tracks[track]:
        true_name = get_name_from_url(submission['url'])
        file_url = './{}/{}'.format(true_name, true_name.replace('-', '_') + '.py')
        comment = "## {}\n\n[Link to File]({}) | [Link to Submission]({})\n\n{}".format(submission['name'], file_url, submission['url'], submission['comment'])
        markdown_comments.append(comment)

    # Join into a single string, then write it out to the track's COMMENTS.md.
    markdown = top + '\n\n'.join(markdown_comments)
    with open(path, 'w+') as file:
        file.write(markdown)
    print('Wrote {} KiB file for {} track'.format(round(os.path.getsize(path) / 1024, 2), track))
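
    # The generated file looks roughly like this (names and URL are illustrative):
    #   # Python Track Comments
    #   ...
    #   ## REST API
    #   [Link to File](./rest-api/rest_api.py) | [Link to Submission](https://exercism.io/tracks/python/exercises/rest-api/solutions/...)
    #   <comment text>
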
t2 = time.time()

# Sorry for this ridiculously long line. ;-;
|
||||
print('Downloaded {} MiB in webpages.\nDownloaded & parsed in {} seconds with {}ms on average request time.'.format(round(sum([i[1] for i in request_timings]) / (1024 ** 2), 2), round(t2 - t1, 2), round((sum(i[0] for i in request_timings) / len(request_timings)) * 1000, 2)))
|
||||