Added unicode compatibility, better stdout info

2018-09-08 23:06:17 +00:00 · 2018-09-08 23:06:17 +00:00 · acec5b6668
commit acec5b6668
parent a701196c2e
2 changed files with 69 additions and 67 deletions
--- a/gen.py
+++ b/gen.py
@@ -3,19 +3,21 @@ import json
 import time
 from mastodon import Mastodon
-api_base_url = "https://botsin.space"
+api_base_url = "https://botsin.space" #todo: this shouldn't be hardcoded
 client = Mastodon(
        client_id="clientcred.secret", 
        access_token="usercred.secret", 
        api_base_url=api_base_url)
-with open("corpus.txt") as fp:
+with open("corpus.txt", encoding="utf-8") as fp:
-    model = markovify.NewlineText(fp.read())
+  model = markovify.NewlineText(fp.read())
 print("tooting")
 sentence = None
 # you will make that damn sentence
 while sentence is None:
-    sentence = model.make_sentence(tries=100000)
+	sentence = model.make_sentence(tries=100000)
-client.toot(sentence.replace("\0", "\n"))
+toot = sentence.replace("\0", "\n")
+client.toot(toot)
+print("Created toot: {}".format(toot))
--- a/main.py
+++ b/main.py
@@ -7,84 +7,84 @@ import re
 api_base_url = "https://botsin.space"
 if not path.exists("clientcred.secret"):
-    print("No clientcred.secret, registering application")
+	print("No clientcred.secret, registering application")
-    Mastodon.create_app("ebooks", api_base_url=api_base_url, to_file="clientcred.secret")
+	Mastodon.create_app("ebooks", api_base_url=api_base_url, to_file="clientcred.secret")
 if not path.exists("usercred.secret"):
-    print("No usercred.secret, registering application")
+	print("No usercred.secret, registering application")
-    email = input("Email: ")
+	email = input("Email: ")
-    password = getpass("Password: ")
+	password = getpass("Password: ")
-    client = Mastodon(client_id="clientcred.secret", api_base_url=api_base_url)
+	client = Mastodon(client_id="clientcred.secret", api_base_url=api_base_url)
-    client.log_in(email, password, to_file="usercred.secret")
+	client.log_in(email, password, to_file="usercred.secret")
 def parse_toot(toot):
-    if toot.spoiler_text != "": return
+	if toot.spoiler_text != "": return
-    if toot.reblog is not None: return
+	if toot.reblog is not None: return
-    if toot.visibility not in ["public", "unlisted"]: return
+	if toot.visibility not in ["public", "unlisted"]: return
-    soup = BeautifulSoup(toot.content, "html.parser")
+	soup = BeautifulSoup(toot.content, "html.parser")
-    
+	
-    # pull the mentions out
+	# pull the mentions out
-    # for mention in soup.select("span.h-card"):
+	# for mention in soup.select("span.h-card"):
-    #     mention.unwrap()
+	#     mention.unwrap()
-    # for mention in soup.select("a.u-url.mention"):
+	# for mention in soup.select("a.u-url.mention"):
-    #     mention.unwrap()
+	#     mention.unwrap()
-    # we will destroy the mentions until we're ready to use them
+	# we will destroy the mentions until we're ready to use them
-    # someday turbocat, you will talk to your sibilings
+	# someday turbocat, you will talk to your sibilings
-    for mention in soup.select("span.h-card"):
+	for mention in soup.select("span.h-card"):
-        mention.decompose()
+		mention.decompose()
-    
+	
-    # make all linebreaks actual linebreaks
+	# make all linebreaks actual linebreaks
-    for lb in soup.select("br"):
+	for lb in soup.select("br"):
-        lb.insert_after("\n")
+		lb.insert_after("\n")
-        lb.decompose()
+		lb.decompose()
-    # make each p element its own line because sometimes they decide not to be
+	# make each p element its own line because sometimes they decide not to be
-    for p in soup.select("p"):
+	for p in soup.select("p"):
-        p.insert_after("\n")
+		p.insert_after("\n")
-        p.unwrap()
+		p.unwrap()
-    
+	
-    # keep hashtags in the toots
+	# keep hashtags in the toots
-    for ht in soup.select("a.hashtag"):
+	for ht in soup.select("a.hashtag"):
-        ht.unwrap()
+		ht.unwrap()
-    # unwrap all links (i like the bots posting links)
+	# unwrap all links (i like the bots posting links)
-    for link in soup.select("a"):
+	for link in soup.select("a"):
-        link.insert_after(link["href"])
+		link.insert_after(link["href"])
-        link.decompose()
+		link.decompose()
-    text = map(lambda a: a.strip(), soup.get_text().strip().split("\n"))
+	text = map(lambda a: a.strip(), soup.get_text().strip().split("\n"))
-    # next up: store this and patch markovify to take it
+	# next up: store this and patch markovify to take it
-    # return {"text": text, "mentions": mentions, "links": links}
+	# return {"text": text, "mentions": mentions, "links": links}
-    # it's 4am though so we're not doing that now, but i still want the parser updates
+	# it's 4am though so we're not doing that now, but i still want the parser updates
-    return "\0".join(list(text))
+	return "\0".join(list(text))
 def get_toots(client, id):
-    i = 0
+	i = 0
-    toots = client.account_statuses(id)
+	toots = client.account_statuses(id)
-    while toots is not None:
+	while toots is not None:
-        for toot in toots:
+		for toot in toots:
-            t = parse_toot(toot)
+			t = parse_toot(toot)
-            if t != None:
+			if t != None:
-                yield t
+				yield t
-        toots = client.fetch_next(toots)
+		toots = client.fetch_next(toots)
-        i += 1
+		i += 1
-        if i%10 == 0:
+		if i%10 == 0:
-            print(i)
+			print(i)
 client = Mastodon(
-        client_id="clientcred.secret", 
+		client_id="clientcred.secret", 
-        access_token="usercred.secret", 
+		access_token="usercred.secret", 
-        api_base_url=api_base_url)
+		api_base_url=api_base_url)
 me = client.account_verify_credentials()
 following = client.account_following(me.id)
-with open("corpus.txt", "w+") as fp:
+with open("corpus.txt", "w+", encoding="utf-8") as fp:
-    for f in following:
+	for f in following:
-        print(f.username)
+		print("Downloading toots for user @{}".format(f.username))
-        for t in get_toots(client, f.id):
+		for t in get_toots(client, f.id):
-            fp.write(t + "\n")
+			fp.write(t + "\n")