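"""Scrape a Markov-chain corpus from the fediverse.

Logs into a Mastodon account, walks every account it follows, and writes each
followed user's public toots (HTML stripped, mentions removed, boosts and
CW'd posts skipped) to corpus.txt, one toot per line.
"""
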
from mastodon import Mastodon
from bs4 import BeautifulSoup
from os import environ


def parse_toot(toot):
    # skip toots hidden behind a content warning
    if toot.spoiler_text != "": return
    # skip boosts; they're someone else's words, not this user's
    if toot.reblog is not None: return
    # only keep toots that were posted public or unlisted
    if toot.visibility not in ["public", "unlisted"]: return

    soup = BeautifulSoup(toot.content, "html.parser")

    # pull the mentions out
    # for mention in soup.select("span.h-card"):
    #     mention.unwrap()

    # for mention in soup.select("a.u-url.mention"):
    #     mention.unwrap()

    # we will destroy the mentions until we're ready to use them
    # someday, turbocat, you will talk to your siblings
    for mention in soup.select("span.h-card"):
        mention.decompose()

    # make all linebreaks actual linebreaks
    for lb in soup.select("br"):
        lb.insert_after("\n")
        lb.decompose()

    # make each p element its own line because sometimes they decide not to be
    for p in soup.select("p"):
        p.insert_after("\n")
        p.unwrap()

    # keep hashtags in the toots
    for ht in soup.select("a.hashtag"):
        ht.unwrap()

    # replace each link with its bare href (i like the bots posting links)
    for link in soup.select("a"):
        link.insert_after(link["href"])
        link.decompose()

    # split the toot into lines and strip stray whitespace from each
    text = map(lambda a: a.strip(), soup.get_text().strip().split("\n"))

    # next up: store this and patch markovify to take it
    # return {"text": text, "mentions": mentions, "links": links}
    # it's 4am though so we're not doing that now, but i still want the parser updates
    # for now, join the toot's lines with NUL so each toot is one corpus record
    return "\0".join(text)


def get_toots(client, id):
    i = 0
    toots = client.account_statuses(id)
    # fetch_next() returns None once there are no more pages, which ends the loop
    while toots is not None:
        for toot in toots:
            t = parse_toot(toot)
            # parse_toot returns None for skipped toots
            if t is not None:
                yield t
        toots = client.fetch_next(toots)
        i += 1
        # print a progress marker every ten pages of toots
        if i % 10 == 0:
            print(i)


client = Mastodon(
    client_id=environ['client_id'],
    client_secret=environ['client_secret'],
    access_token=environ['access_token'],
    api_base_url=environ['instance'],
)
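
# The four environment variables above are assumed to be set before running.
# One way to generate the credentials (a sketch, not part of this script) is
# Mastodon.py's own registration helper:
#
#   Mastodon.create_app("corpus-scraper", api_base_url=environ['instance'],
#                       to_file="clientcred.secret")
#
# after which logging in once with Mastodon.log_in() yields the access token.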
me = client.account_verify_credentials()
following = client.account_following(me.id)
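# note: account_following() is itself paginated; if this account follows more
# users than one page returns, client.fetch_remaining(following) would be
# needed to get them all (assumed fine here for a small follow list)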
with open("corpus.txt", "w+", encoding="utf-8") as fp:
    for f in following:
        print("Downloading toots for user @{}".format(f.username))
        for t in get_toots(client, f.id):
            # one line per toot; internal linebreaks became NUL in parse_toot
            fp.write(t + "\n")
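
# A sketch of how corpus.txt might be consumed later, per the markovify note
# in parse_toot (assumed usage, not part of this script):
#
#   import markovify
#
#   with open("corpus.txt", encoding="utf-8") as fp:
#       model = markovify.NewlineText(fp.read())
#   print(model.make_sentence())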