Added unicode compatibility, better stdout info

This commit is contained in:
Lynne 2018-09-08 23:06:17 +00:00
parent a701196c2e
commit acec5b6668
2 changed files with 69 additions and 67 deletions

12
gen.py
View file

@@ -3,19 +3,21 @@ import json
import time import time
from mastodon import Mastodon from mastodon import Mastodon
api_base_url = "https://botsin.space" api_base_url = "https://botsin.space" #todo: this shouldn't be hardcoded
client = Mastodon( client = Mastodon(
client_id="clientcred.secret", client_id="clientcred.secret",
access_token="usercred.secret", access_token="usercred.secret",
api_base_url=api_base_url) api_base_url=api_base_url)
with open("corpus.txt") as fp: with open("corpus.txt", encoding="utf-8") as fp:
model = markovify.NewlineText(fp.read()) model = markovify.NewlineText(fp.read())
print("tooting") print("tooting")
sentence = None sentence = None
# you will make that damn sentence # you will make that damn sentence
while sentence is None: while sentence is None:
sentence = model.make_sentence(tries=100000) sentence = model.make_sentence(tries=100000)
client.toot(sentence.replace("\0", "\n")) toot = sentence.replace("\0", "\n")
client.toot(toot)
print("Created toot: {}".format(toot))

124
main.py
View file

@@ -7,84 +7,84 @@ import re
api_base_url = "https://botsin.space" api_base_url = "https://botsin.space"
if not path.exists("clientcred.secret"): if not path.exists("clientcred.secret"):
print("No clientcred.secret, registering application") print("No clientcred.secret, registering application")
Mastodon.create_app("ebooks", api_base_url=api_base_url, to_file="clientcred.secret") Mastodon.create_app("ebooks", api_base_url=api_base_url, to_file="clientcred.secret")
if not path.exists("usercred.secret"): if not path.exists("usercred.secret"):
print("No usercred.secret, registering application") print("No usercred.secret, registering application")
email = input("Email: ") email = input("Email: ")
password = getpass("Password: ") password = getpass("Password: ")
client = Mastodon(client_id="clientcred.secret", api_base_url=api_base_url) client = Mastodon(client_id="clientcred.secret", api_base_url=api_base_url)
client.log_in(email, password, to_file="usercred.secret") client.log_in(email, password, to_file="usercred.secret")
def parse_toot(toot): def parse_toot(toot):
if toot.spoiler_text != "": return if toot.spoiler_text != "": return
if toot.reblog is not None: return if toot.reblog is not None: return
if toot.visibility not in ["public", "unlisted"]: return if toot.visibility not in ["public", "unlisted"]: return
soup = BeautifulSoup(toot.content, "html.parser") soup = BeautifulSoup(toot.content, "html.parser")
# pull the mentions out # pull the mentions out
# for mention in soup.select("span.h-card"): # for mention in soup.select("span.h-card"):
# mention.unwrap() # mention.unwrap()
# for mention in soup.select("a.u-url.mention"): # for mention in soup.select("a.u-url.mention"):
# mention.unwrap() # mention.unwrap()
# we will destroy the mentions until we're ready to use them # we will destroy the mentions until we're ready to use them
# someday turbocat, you will talk to your siblings # someday turbocat, you will talk to your siblings
for mention in soup.select("span.h-card"): for mention in soup.select("span.h-card"):
mention.decompose() mention.decompose()
# make all linebreaks actual linebreaks # make all linebreaks actual linebreaks
for lb in soup.select("br"): for lb in soup.select("br"):
lb.insert_after("\n") lb.insert_after("\n")
lb.decompose() lb.decompose()
# make each p element its own line because sometimes they decide not to be # make each p element its own line because sometimes they decide not to be
for p in soup.select("p"): for p in soup.select("p"):
p.insert_after("\n") p.insert_after("\n")
p.unwrap() p.unwrap()
# keep hashtags in the toots # keep hashtags in the toots
for ht in soup.select("a.hashtag"): for ht in soup.select("a.hashtag"):
ht.unwrap() ht.unwrap()
# unwrap all links (i like the bots posting links) # unwrap all links (i like the bots posting links)
for link in soup.select("a"): for link in soup.select("a"):
link.insert_after(link["href"]) link.insert_after(link["href"])
link.decompose() link.decompose()
text = map(lambda a: a.strip(), soup.get_text().strip().split("\n")) text = map(lambda a: a.strip(), soup.get_text().strip().split("\n"))
# next up: store this and patch markovify to take it # next up: store this and patch markovify to take it
# return {"text": text, "mentions": mentions, "links": links} # return {"text": text, "mentions": mentions, "links": links}
# it's 4am though so we're not doing that now, but i still want the parser updates # it's 4am though so we're not doing that now, but i still want the parser updates
return "\0".join(list(text)) return "\0".join(list(text))
def get_toots(client, id): def get_toots(client, id):
i = 0 i = 0
toots = client.account_statuses(id) toots = client.account_statuses(id)
while toots is not None: while toots is not None:
for toot in toots: for toot in toots:
t = parse_toot(toot) t = parse_toot(toot)
if t != None: if t != None:
yield t yield t
toots = client.fetch_next(toots) toots = client.fetch_next(toots)
i += 1 i += 1
if i%10 == 0: if i%10 == 0:
print(i) print(i)
client = Mastodon( client = Mastodon(
client_id="clientcred.secret", client_id="clientcred.secret",
access_token="usercred.secret", access_token="usercred.secret",
api_base_url=api_base_url) api_base_url=api_base_url)
me = client.account_verify_credentials() me = client.account_verify_credentials()
following = client.account_following(me.id) following = client.account_following(me.id)
with open("corpus.txt", "w+") as fp: with open("corpus.txt", "w+", encoding="utf-8") as fp:
for f in following: for f in following:
print(f.username) print("Downloading toots for user @{}".format(f.username))
for t in get_toots(client, f.id): for t in get_toots(client, f.id):
fp.write(t + "\n") fp.write(t + "\n")