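"""Scrape a Markov-chain corpus from the fediverse.

Logs into a Mastodon account, walks every account it follows, and writes each
followed user's public toots (HTML stripped, mentions removed, boosts and
CW'd posts skipped) to corpus.txt, one toot per line.
"""
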
from mastodon import Mastodon
from bs4 import BeautifulSoup
from os import environ


def parse_toot(toot):
    # skip toots hidden behind a content warning
    if toot.spoiler_text != "": return
    # skip boosts; they're someone else's words, not this user's
    if toot.reblog is not None: return
    # only keep toots that were posted public or unlisted
    if toot.visibility not in ["public", "unlisted"]: return

    soup = BeautifulSoup(toot.content, "html.parser")

    # pull the mentions out
    # for mention in soup.select("span.h-card"):
    #     mention.unwrap()

    # for mention in soup.select("a.u-url.mention"):
    #     mention.unwrap()

    # we will destroy the mentions until we're ready to use them
    # someday, turbocat, you will talk to your siblings
    for mention in soup.select("span.h-card"):
        mention.decompose()

    # make all linebreaks actual linebreaks
    for lb in soup.select("br"):
        lb.insert_after("\n")
        lb.decompose()

    # make each p element its own line because sometimes they decide not to be
    for p in soup.select("p"):
        p.insert_after("\n")
        p.unwrap()

    # keep hashtags in the toots
    for ht in soup.select("a.hashtag"):
        ht.unwrap()

    # replace each link with its bare href (i like the bots posting links)
    for link in soup.select("a"):
        link.insert_after(link["href"])
        link.decompose()

    # split the toot into lines and strip stray whitespace from each
    text = map(lambda a: a.strip(), soup.get_text().strip().split("\n"))

    # next up: store this and patch markovify to take it
    # return {"text": text, "mentions": mentions, "links": links}
    # it's 4am though so we're not doing that now, but i still want the parser updates
    # for now, join the toot's lines with NUL so each toot is one corpus record
    return "\0".join(text)


def get_toots(client, id):
    i = 0
    toots = client.account_statuses(id)
    # fetch_next() returns None once there are no more pages, which ends the loop
    while toots is not None:
        for toot in toots:
            t = parse_toot(toot)
            # parse_toot returns None for skipped toots
            if t is not None:
                yield t
        toots = client.fetch_next(toots)
        i += 1
        # print a progress marker every ten pages of toots
        if i % 10 == 0:
            print(i)


client = Mastodon(
    client_id=environ['client_id'],
    client_secret=environ['client_secret'],
    access_token=environ['access_token'],
    api_base_url=environ['instance'],
)
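
# The four environment variables above are assumed to be set before running.
# One way to generate the credentials (a sketch, not part of this script) is
# Mastodon.py's own registration helper:
#
#   Mastodon.create_app("corpus-scraper", api_base_url=environ['instance'],
#                       to_file="clientcred.secret")
#
# after which logging in once with Mastodon.log_in() yields the access token.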
me = client.account_verify_credentials()
following = client.account_following(me.id)
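# note: account_following() is itself paginated; if this account follows more
# users than one page returns, client.fetch_remaining(following) would be
# needed to get them all (assumed fine here for a small follow list)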
with open("corpus.txt", "w+", encoding="utf-8") as fp:
    for f in following:
        print("Downloading toots for user @{}".format(f.username))
        for t in get_toots(client, f.id):
            # one line per toot; internal linebreaks became NUL in parse_toot
            fp.write(t + "\n")
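
# A sketch of how corpus.txt might be consumed later, per the markovify note
# in parse_toot (assumed usage, not part of this script):
#
#   import markovify
#
#   with open("corpus.txt", encoding="utf-8") as fp:
#       model = markovify.NewlineText(fp.read())
#   print(model.make_sentence())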