Version 2.0, with vastly improved toot fetching capabilities!

This commit is contained in:
Lynne 2018-10-25 12:37:11 +10:00
parent 3d059d0b9b
commit 19899cafee

181
main.py
View file

@ -1,4 +1,5 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
# toot downloader version two!!
# This Source Code Form is subject to the terms of the Mozilla Public # This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this # License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/. # file, You can obtain one at http://mozilla.org/MPL/2.0/.
@ -6,98 +7,87 @@
from mastodon import Mastodon from mastodon import Mastodon
from os import path from os import path
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import shutil, os, sqlite3, signal, sys, json import os, sqlite3, signal, sys, json, re
# import re import requests
scopes = ["read:statuses", "read:accounts", "read:follows", "write:statuses"] scopes = ["read:statuses", "read:accounts", "read:follows", "write:statuses"]
cfg = json.load(open('config.json', 'r')) cfg = json.load(open('config.json', 'r'))
if not path.exists("clientcred.secret"): if os.path.exists("clientcred.secret"):
print("Upgrading to new storage method")
cc = open("clientcred.secret").read().split("\n")
cfg['client'] = {
"id": cc[0],
"secret": cc[1]
}
cfg['secret'] = open("usercred.secret").read().rstrip("\n")
os.remove("clientcred.secret")
os.remove("usercred.secret")
print("No clientcred.secret, registering application") if "client" not in cfg:
Mastodon.create_app("lynnesbian_mastodon_ebooks", api_base_url=cfg['site'], to_file="clientcred.secret", scopes=scopes, website="https://github.com/Lynnesbian/mastodon-ebooks") print("No client credentials, registering application")
client_id, client_secret = Mastodon.create_app("mstdn-ebooks",
api_base_url=cfg['site'],
scopes=scopes,
website="https://github.com/Lynnesbian/mstdn-ebooks")
if not path.exists("usercred.secret"): cfg['client'] = {
print("No usercred.secret, registering application") "id": client_id,
client = Mastodon(client_id="clientcred.secret", api_base_url=cfg['site']) "secret": client_secret
print("Visit this url:") }
print(client.auth_request_url(scopes=scopes))
client.log_in(code=input("Secret: "), to_file="usercred.secret", scopes=scopes)
def parse_toot(toot): if "secret" not in cfg:
if toot.spoiler_text != "": return print("No user credentials, logging in")
if toot.reblog is not None: return client = Mastodon(client_id = cfg['client']['id'],
if toot.visibility not in ["public", "unlisted"]: return client_secret = cfg['client']['secret'],
api_base_url=cfg['site'])
soup = BeautifulSoup(toot.content, "html.parser") print("Open this URL: {}".format(client.auth_request_url(scopes=scopes)))
cfg['secret'] = client.log_in(code=input("Secret: "), scopes=scopes)
# Persist the (possibly updated) credentials back to config.json.
# A context manager guarantees the file is flushed and closed before the
# script carries on; the file is only written here, so plain "w" suffices.
with open("config.json", "w") as config_file:
    json.dump(cfg, config_file)
def extract_toot(toot):
    """Convert the HTML content of a toot into plain text.

    Mentions and hashtags keep their text, ``<br>`` and ``<p>`` elements
    become line breaks, and every remaining link is replaced by its
    ``href`` target.

    Args:
        toot: HTML string holding the toot's content.

    Returns:
        The plain-text toot with trailing newlines stripped and a
        zero-width space (U+200B) inserted after every ``@``.
    """
    # Decode these two entities up front, before the HTML is parsed.
    toot = toot.replace("&apos;", "'")
    toot = toot.replace("&quot;", '"')
    soup = BeautifulSoup(toot, "html.parser")

    # Strip the markup inside each mention (the link and its inner span)
    # so that only the mention's text remains.
    # TODO: make it so that it removes the @ and instance but keeps the name
    for mention in soup.select("span.h-card"):
        # Either child may be absent; skip it instead of raising
        # AttributeError on None.
        if mention.a is not None:
            mention.a.unwrap()
        if mention.span is not None:
            mention.span.unwrap()

    # replace <br> with linebreak
    for lb in soup.select("br"):
        lb.insert_after("\n")
        lb.decompose()

    # replace <p> with linebreak
    for p in soup.select("p"):
        p.insert_after("\n")
        p.unwrap()

    # fix hashtags: keep the tag text, drop the surrounding link
    for ht in soup.select("a.hashtag"):
        ht.unwrap()

    # fix links: replace each remaining link with the URL it points to
    for link in soup.select("a"):
        href = link.get("href")
        if href is None:
            # An anchor without a target has no URL to substitute;
            # keep its text rather than failing with KeyError.
            link.unwrap()
            continue
        link.insert_after(href)
        link.decompose()

    text = soup.get_text().rstrip("\n")  # remove trailing newline
    # Put a zero-width space between @ and the username to avoid mentioning.
    # U+200B is ZERO WIDTH SPACE; U+202B (used previously) is a
    # right-to-left embedding control, not a space.
    return text.replace("@", "@\u200B")
# it's 4am though so we're not doing that now, but i still want the parser updates
#todo: we split above and join now, which is dumb, but i don't wanna mess with the map code bc i don't understand it uwu
text = "\n".join(list(text))
text = text.replace("&apos;", "'")
return text
def get_toots(client, id, since_id):
    """Yield the parsed statuses of one account, page by page.

    Each status is passed through ``parse_toot``; statuses for which it
    returns ``None`` are skipped. A page counter is printed after every
    tenth page fetched.

    Args:
        client: API client providing ``account_statuses`` and ``fetch_next``.
        id: Account id whose statuses are requested.
        since_id: Passed to ``account_statuses`` as its ``since_id`` argument.

    Yields:
        Dicts with the keys ``"content"`` (parsed text) and ``"id"``
        (the status id).

    Note:
        If fetching the next page raises ``TimeoutError``, the module-level
        ``db`` connection is committed and closed and the process exits
        with status 1.
    """
    pages_fetched = 0
    toots = client.account_statuses(id, since_id=since_id)
    # Stops on both None and an empty page.
    while toots:
        for toot in toots:
            text = parse_toot(toot)
            if text is not None:
                yield {
                    "content": text,
                    "id": toot.id,
                }
        try:
            toots = client.fetch_next(toots)
        except TimeoutError:
            print("Operation timed out, committing to database and exiting.")
            db.commit()
            db.close()
            sys.exit(1)
        pages_fetched += 1
        if pages_fetched % 10 == 0:
            print(pages_fetched)
client = Mastodon( client = Mastodon(
client_id="clientcred.secret", client_id=cfg['client']['id'],
access_token="usercred.secret", client_secret = cfg['client']['secret'],
api_base_url=cfg['site']) access_token=cfg['secret'],
api_base_url=cfg['site'])
me = client.account_verify_credentials() me = client.account_verify_credentials()
following = client.account_following(me.id) following = client.account_following(me.id)
@ -105,7 +95,7 @@ following = client.account_following(me.id)
db = sqlite3.connect("toots.db") db = sqlite3.connect("toots.db")
db.text_factory=str db.text_factory=str
c = db.cursor() c = db.cursor()
c.execute("CREATE TABLE IF NOT EXISTS `toots` (id INT NOT NULL UNIQUE PRIMARY KEY, userid INT NOT NULL, content VARCHAR NOT NULL) WITHOUT ROWID") c.execute("CREATE TABLE IF NOT EXISTS `toots` (id INT NOT NULL UNIQUE PRIMARY KEY, userid INT NOT NULL, uri VARCHAR NOT NULL, content VARCHAR NOT NULL) WITHOUT ROWID")
db.commit() db.commit()
def handleCtrlC(signal, frame): def handleCtrlC(signal, frame):
@ -121,12 +111,65 @@ for f in following:
last_toot = last_toot[0] last_toot = last_toot[0]
else: else:
last_toot = 0 last_toot = 0
print("Downloading toots for user @{}, starting from {}".format(f.username, last_toot)) print("Harvesting toots for user @{}, starting from {}".format(f.acct, last_toot))
for t in get_toots(client, f.id, last_toot):
# try: #find the user's activitypub outbox
c.execute("REPLACE INTO toots (id, userid, content) VALUES (?, ?, ?)", (t['id'], f.id, t['content'])) print("WebFingering...")
# except: instance = re.search(r"^.*@(.+)", f.acct)
# pass #ignore toots that can't be encoded properly if instance == None:
instance = re.search(r"https?:\/\/(.*)", cfg['site']).group(1)
else:
instance = instance.group(1)
# print("{} is on {}".format(f.acct, instance))
# Resolve the followed account to the first page of its ActivityPub outbox:
# host-meta gives the WebFinger URL template, WebFinger gives the actor URI,
# and the outbox is requested relative to that URI.
try:
    r = requests.get("https://{}/.well-known/host-meta".format(instance))
    r.raise_for_status()  # surface HTTP errors (e.g. rate limiting) explicitly
    uri = re.search(r'template="([^"]+)"', r.text).group(1)
    uri = uri.format(uri = "{}@{}".format(f.username, instance))
    r = requests.get(uri)
    r.raise_for_status()
    webfinger = r.json()
    # Prefer the link the server advertises as rel="self" over guessing by
    # position; fall back to the second alias when no such link is present.
    actor = next(
        (link["href"] for link in webfinger.get("links", [])
         if link.get("rel") == "self" and "href" in link),
        None,
    )
    if actor is None:
        actor = webfinger['aliases'][1]
    uri = "{}/outbox?page=true&min_id={}".format(actor, last_toot)
    r = requests.get(uri)
    r.raise_for_status()
    j = r.json()
except Exception as err:
    # Top-level boundary: any failure during discovery is fatal for the run.
    print("oopsy woopsy!! we made a fucky wucky!!!\n(we're probably rate limited, please hang up and try again)")
    # Also report what actually went wrong instead of only guessing.
    print("Details: {!r}".format(err), file=sys.stderr)
    sys.exit(1)
# Walk the outbox pages starting at `j`, storing every non-CW'd "Create"
# activity in the toots table, then follow each page's `prev` link.
print("Downloading and parsing toots", end='', flush=True)
current = None  # not read anywhere in the visible code; kept in case later code uses it
try:
    while len(j['orderedItems']) > 0:
        for oi in j['orderedItems']:
            if oi['type'] != "Create":
                continue
            # it's a toot
            obj = oi['object']
            if obj.get('summary') is not None:
                # don't download CW'd toots
                continue
            toot = extract_toot(obj['content'])
            try:
                c.execute("REPLACE INTO toots (id, userid, uri, content) VALUES (?, ?, ?, ?)",
                    (re.search(r"[^\/]+$", obj['id']).group(0),
                    f.id,
                    obj['id'],
                    toot
                    )
                )
            except Exception:
                # Deliberately best-effort: ignore any toots that don't go
                # into the DB. `Exception` (not a bare except) lets
                # SystemExit and KeyboardInterrupt propagate.
                pass
        if 'prev' not in j:
            # No further page advertised: harvesting is complete, not failed.
            break
        r = requests.get(j['prev'])
        j = r.json()
        print('.', end='', flush=True)
    print(" Done!")
    db.commit()
except Exception as err:
    print("Encountered an error! Saving toots to database and exiting.")
    print("Details: {!r}".format(err), file=sys.stderr)
    db.commit()
    db.close()
    sys.exit(1)
db.commit() db.commit()
db.execute("VACUUM") #compact db db.execute("VACUUM") #compact db