Merge branch 'master' into master

Lynne 2019-02-23 10:25:15 +10:00 committed by GitHub
commit 64b49da4eb
7 changed files with 169 additions and 185 deletions

.gitignore (vendored): 3 changes

@@ -7,3 +7,6 @@ toots.db
 toots.db-journal
 toots.db-wal
 __pycache__/*
+.vscode/
+.editorconfig
+.*.swp


@@ -1 +1,4 @@
-{"site":"https://botsin.space","cw":null}
+{
+    "site": "https://botsin.space",
+    "cw": null
+}


@@ -1,62 +0,0 @@
#!/usr/bin/env python3
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

import markovify
import json
import re, random, multiprocessing, time, sqlite3, shutil, os

def make_sentence(output):
    class nlt_fixed(markovify.NewlineText):
        def test_sentence_input(self, sentence):
            return True #all sentences are valid <3

    # with open("corpus.txt", encoding="utf-8") as fp:
    #     model = nlt_fixed(fp.read())

    shutil.copyfile("toots.db", "toots-copy.db")
    db = sqlite3.connect("toots-copy.db")
    db.text_factory=str
    c = db.cursor()
    toots = c.execute("SELECT content FROM `toots`").fetchall()
    toots_str = ""
    for toot in toots:
        toots_str += "\n{}".format(toot[0])
    model = nlt_fixed(toots_str)
    toots_str = None
    db.close()
    os.remove("toots-copy.db")

    sentence = None
    tries = 0
    while sentence is None and tries < 10:
        sentence = model.make_short_sentence(500, tries=10000)
        tries = tries + 1
    sentence = re.sub("^@\u202B[^ ]* ", "", sentence)
    output.send(sentence)

def make_toot(force_markov = False, args = None):
    return make_toot_markov()

def make_toot_markov(query = None):
    tries = 0
    toot = None
    while toot == None and tries < 25:
        pin, pout = multiprocessing.Pipe(False)
        p = multiprocessing.Process(target = make_sentence, args = [pout])
        p.start()
        p.join(10)
        if p.is_alive():
            p.terminate()
            p.join()
            toot = None
            tries = tries + 1
        else:
            toot = pin.recv()
    if toot == None:
        toot = "Toot generation failed! Contact Lynne for assistance."
    return {
        "toot": toot,
        "media": None
    }

functions.py (new executable file): 87 changes

@@ -0,0 +1,87 @@
#!/usr/bin/env python3
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

import markovify
from bs4 import BeautifulSoup
import re, multiprocessing, sqlite3, shutil, os, json

def make_sentence(output):
    class nlt_fixed(markovify.NewlineText): #modified version of NewlineText that never rejects sentences
        def test_sentence_input(self, sentence):
            return True #all sentences are valid <3

    shutil.copyfile("toots.db", "toots-copy.db") #create a copy of the database because reply.py will be using the main one
    db = sqlite3.connect("toots-copy.db")
    db.text_factory=str
    c = db.cursor()
    toots = c.execute("SELECT content FROM `toots` ORDER BY RANDOM() LIMIT 10000").fetchall()
    toots_str = ""
    for toot in toots:
        toots_str += "\n{}".format(toot[0])
    model = nlt_fixed(toots_str)
    toots_str = None
    db.close()
    os.remove("toots-copy.db")

    sentence = None
    tries = 0
    while sentence is None and tries < 10:
        sentence = model.make_short_sentence(500, tries=10000)
        tries = tries + 1
    sentence = re.sub("^(?:@\u202B[^ ]* )*", "", sentence) #remove leading pings (don't say "@bob blah blah" but still say "blah @bob blah")
    sentence = re.sub("^(?:@\u200B[^ ]* )*", "", sentence)
    output.send(sentence)

def make_toot(force_markov = False, args = None):
    return make_toot_markov()

def make_toot_markov(query = None):
    tries = 0
    toot = None
    while toot == None and tries < 10: #try to make a toot 10 times
        pin, pout = multiprocessing.Pipe(False)
        p = multiprocessing.Process(target = make_sentence, args = [pout])
        p.start()
        p.join(10) #wait 10 seconds to get something
        if p.is_alive(): #if it's still trying to make a toot after 10 seconds
            p.terminate()
            p.join()
            toot = None
            tries = tries + 1 #give up, and increment tries by one
        else:
            toot = pin.recv()
    if toot == None: #if we've tried and failed ten times, just give up
        toot = "Toot generation failed! Contact Lynne (lynnesbian@fedi.lynnesbian.space) for assistance."
    return {
        "toot": toot,
        "media": None
    }

def extract_toot(toot):
    toot = toot.replace("&apos;", "'") #convert HTML stuff to normal stuff
    toot = toot.replace("&quot;", '"') #ditto
    soup = BeautifulSoup(toot, "html.parser")
    for lb in soup.select("br"): #replace <br> with linebreak
        lb.insert_after("\n")
        lb.decompose()
    for p in soup.select("p"): #ditto for <p>
        p.insert_after("\n")
        p.unwrap()
    for ht in soup.select("a.hashtag"): #make hashtags no longer links, just text
        ht.unwrap()
    for link in soup.select("a"): #convert <a href="https://example.com">example.com</a> to just https://example.com
        link.insert_after(link["href"])
        link.decompose()
    text = soup.get_text()
    text = re.sub("https://([^/]+)/(@[^ ]+)", r"\2@\1", text) #put mastodon-style mentions back in
    text = re.sub("https://([^/]+)/users/([^ ]+)", r"@\2@\1", text) #put pleroma-style mentions back in
    text = text.rstrip("\n") #remove trailing newline
    return text

gen.py: 13 changes

@@ -5,13 +5,11 @@
 from mastodon import Mastodon
 import argparse, sys, traceback, json
-import create
+import functions
 parser = argparse.ArgumentParser(description='Generate and post a toot.')
-parser.add_argument('reply', metavar='reply', type=str, nargs='?',
-    help='ID of the status to reply to')
 parser.add_argument('-s', '--simulate', dest='simulate', action='store_true',
-    help="Print the toot to stdout without posting it")
+    help="Print the toot without actually posting it. Use this to make sure your bot's actually working.")
 args = parser.parse_args()
@@ -24,7 +22,7 @@ client = Mastodon(
     api_base_url=cfg['site'])
 if __name__ == '__main__':
-    toot = create.make_toot()
+    toot = functions.make_toot()
     if not args.simulate:
         try:
             if toot['media'] != None:
@@ -35,10 +33,7 @@ if __name__ == '__main__':
                 client.status_post(toot['toot'], visibility = 'unlisted', spoiler_text = cfg['cw'])
         except Exception as err:
             toot = {
-                "toot":
-                    "Mistress @lynnesbian@fedi.lynnesbian.space, something has gone terribly" \
-                    + " wrong! While attempting to post a toot, I received the following" \
-                    + " error:\n" + "\n".join(traceback.format_tb(sys.exc_info()[2]))
+                "toot": "An unknown error that should never happen occurred. Maybe it's because of the spoiler text, which is {}. If not, I have no idea what went wrong. This is an error message -- contact lynnesbian@fedi.lynnesbian.space for assistance.".format(cfg['cw'])
             }
             client.status_post(toot['toot'], visibility = 'unlisted', spoiler_text = "Error!")
     print(toot['toot'])

main.py: 141 changes

@@ -7,26 +7,43 @@
 from mastodon import Mastodon
 from os import path
 from bs4 import BeautifulSoup
-import os, sqlite3, signal, sys, json, re
+import os, sqlite3, signal, sys, json, re, shutil
 import requests
+import functions
 scopes = ["read:statuses", "read:accounts", "read:follows", "write:statuses", "read:notifications"]
-cfg = json.load(open('config.json', 'r'))
+try:
+    cfg = json.load(open('config.json', 'r'))
+except:
+    shutil.copy2("config.sample.json", "config.json")
+    cfg = json.load(open('config.json', 'r'))
+#config.json *MUST* contain the instance URL, the instance blacklist (for dead/broken instances), and the CW text. if they're not provided, we'll fall back to defaults.
+if 'site' not in cfg:
+    cfg['site'] = "https://botsin.space"
+if 'cw' not in cfg:
+    cfg['cw'] = None
+if 'instance_blacklist' not in cfg:
+    cfg["instance_blacklist"] = [
+        "bofa.lol",
+        "witches.town"
+    ]
+#if the user is using a (very!) old version that still uses the .secret files, migrate to the new method
 if os.path.exists("clientcred.secret"):
     print("Upgrading to new storage method")
     cc = open("clientcred.secret").read().split("\n")
     cfg['client'] = {
         "id": cc[0],
         "secret": cc[1]
     }
     cfg['secret'] = open("usercred.secret").read().rstrip("\n")
     os.remove("clientcred.secret")
     os.remove("usercred.secret")
 if "client" not in cfg:
-    print("No client credentials, registering application")
+    print("No application info -- registering application with {}".format(cfg['site']))
     client_id, client_secret = Mastodon.create_app("mstdn-ebooks",
         api_base_url=cfg['site'],
         scopes=scopes,
@@ -38,47 +55,18 @@ if "client" not in cfg:
     }
 if "secret" not in cfg:
-    print("No user credentials, logging in")
+    print("No user credentials -- logging in to {}".format(cfg['site']))
     client = Mastodon(client_id = cfg['client']['id'],
         client_secret = cfg['client']['secret'],
         api_base_url=cfg['site'])
-    print("Open this URL: {}".format(client.auth_request_url(scopes=scopes)))
+    print("Open this URL and authenticate to give mstdn-ebooks access to your bot's account: {}".format(client.auth_request_url(scopes=scopes)))
     cfg['secret'] = client.log_in(code=input("Secret: "), scopes=scopes)
 json.dump(cfg, open("config.json", "w+"))
 def extract_toot(toot):
-    toot = toot.replace("&apos;", "'")
-    toot = toot.replace("&quot;", '"')
-    soup = BeautifulSoup(toot, "html.parser")
-    # this is the code that removes all mentions
-    for mention in soup.select("span.h-card"):
-        mention.a.unwrap()
-        mention.span.unwrap()
-    # replace <br> with linebreak
-    for lb in soup.select("br"):
-        lb.insert_after("\n")
-        lb.decompose()
-    # replace <p> with linebreak
-    for p in soup.select("p"):
-        p.insert_after("\n")
-        p.unwrap()
-    # fix hashtags
-    for ht in soup.select("a.hashtag"):
-        ht.unwrap()
-    # fix links
-    for link in soup.select("a"):
-        link.insert_after(link["href"])
-        link.decompose()
-    toot = soup.get_text()
-    toot = toot.rstrip("\n") #remove trailing newline
+    toot = functions.extract_toot(toot)
     toot = toot.replace("@", "@\u200B") #put a zws between @ and username to avoid mentioning
     return(toot)
@@ -104,25 +92,12 @@ def handleCtrlC(signal, frame):
 signal.signal(signal.SIGINT, handleCtrlC)
-def get_toots_legacy(client, id):
-    i = 0
-    toots = client.account_statuses(id)
-    while toots is not None and len(toots) > 0:
-        for toot in toots:
-            if toot.spoiler_text != "": continue
-            if toot.reblog is not None: continue
-            if toot.visibility not in ["public", "unlisted"]: continue
-            t = extract_toot(toot.content)
-            if t != None:
-                yield {
-                    "toot": t,
-                    "id": toot.id,
-                    "uri": toot.uri
-                }
-        toots = client.fetch_next(toots)
-        i += 1
-        if i%20 == 0:
-            print('.', end='', flush=True)
+patterns = {
+    "handle": re.compile(r"^.*@(.+)"),
+    "url": re.compile(r"https?:\/\/(.*)"),
+    "uri": re.compile(r'template="([^"]+)"'),
+    "pid": re.compile(r"[^\/]+$"),
+}
 for f in following:
     last_toot = c.execute("SELECT id FROM `toots` WHERE userid LIKE ? ORDER BY id DESC LIMIT 1", (f.id,)).fetchone()
@@ -133,28 +108,27 @@ for f in following:
     print("Harvesting toots for user @{}, starting from {}".format(f.acct, last_toot))
     #find the user's activitypub outbox
-    print("WebFingering...")
-    instance = re.search(r"^.*@(.+)", f.acct)
+    print("WebFingering... (do not laugh at this. WebFinger is a federated protocol. https://wikipedia.org/wiki/WebFinger)")
+    instance = patterns["handle"].search(f.acct)
     if instance == None:
-        instance = re.search(r"https?:\/\/(.*)", cfg['site']).group(1)
+        instance = patterns["url"].search(cfg['site']).group(1)
     else:
         instance = instance.group(1)
-    if instance == "bofa.lol":
-        print("rest in piece bofa, skipping")
+    if instance in cfg['instance_blacklist']:
+        print("skipping blacklisted instance: {}".format(instance))
         continue
-    # print("{} is on {}".format(f.acct, instance))
     try:
         r = requests.get("https://{}/.well-known/host-meta".format(instance), timeout=10)
-        uri = re.search(r'template="([^"]+)"', r.text).group(1)
+        uri = patterns["uri"].search(r.text).group(1)
         uri = uri.format(uri = "{}@{}".format(f.username, instance))
         r = requests.get(uri, headers={"Accept": "application/json"}, timeout=10)
         j = r.json()
-        if len(j['aliases']) == 1: #TODO: this is a hack on top of a hack, fix it
-            uri = j['aliases'][0]
-        else:
-            uri = j['aliases'][1]
+        for link in j['links']:
+            if link['rel'] == 'self':
+                #this is a link formatted like "https://instan.ce/users/username", which is what we need
+                uri = link['href']
         uri = "{}/outbox?page=true".format(uri)
         r = requests.get(uri, timeout=10)
         j = r.json()
@@ -168,23 +142,23 @@ for f in following:
         pleroma = True
         j = j['first']
     else:
-        print("Mastodon instance detected")
+        print("Mastodon/Misskey instance detected")
         uri = "{}&min_id={}".format(uri, last_toot)
         r = requests.get(uri)
         j = r.json()
-    print("Downloading and parsing toots", end='', flush=True)
+    print("Downloading and saving toots", end='', flush=True)
     done = False
     try:
         while not done and len(j['orderedItems']) > 0:
             for oi in j['orderedItems']:
                 if oi['type'] != "Create":
-                    continue #not a toost. fuck outta here
+                    continue #this isn't a toot/post/status/whatever, it's a boost or a follow or some other activitypub thing. ignore
                 # its a toost baby
                 content = oi['object']['content']
                 if oi['object']['summary'] != None and oi['object']['summary'] != "":
-                    #don't download CW'd toots
+                    #don't download CW'd toots. if you want your bot to download and learn from CW'd toots, replace "continue" with "pass". (todo: add a config.json option for this)
                     continue
                 toot = extract_toot(content)
                 # print(toot)
@@ -192,11 +166,12 @@ for f in following:
                     if pleroma:
                         if c.execute("SELECT COUNT(*) FROM toots WHERE uri LIKE ?", (oi['object']['id'],)).fetchone()[0] > 0:
                             #we've caught up to the notices we've already downloaded, so we can stop now
+                            #you might be wondering, "lynne, what if the instance ratelimits you after 40 posts, and they've made 60 since main.py was last run? wouldn't the bot miss 20 posts and never be able to see them?" to which i reply, "it's called mstdn-ebooks not fediverse-ebooks. pleroma support is an afterthought"
                             done = True
                             break
-                    pid = re.search(r"[^\/]+$", oi['object']['id']).group(0)
-                    c.execute("REPLACE INTO toots (id, userid, uri, content) VALUES (?, ?, ?, ?)",
-                        (pid,
+                    pid = patterns["pid"].search(oi['object']['id']).group(0)
+                    c.execute("REPLACE INTO toots (id, userid, uri, content) VALUES (?, ?, ?, ?)", (
+                        pid,
                         f.id,
                         oi['object']['id'],
                         toot
@@ -205,7 +180,6 @@ for f in following:
                     pass
                 except:
                     pass #ignore any toots that don't successfully go into the DB
-            # sys.exit(0)
             if not pleroma:
                 r = requests.get(j['prev'], timeout=15)
             else:
@@ -215,9 +189,8 @@ for f in following:
         print(" Done!")
         db.commit()
     except:
-        print("Encountered an error! Saving toots to database and continuing.")
+        print("Encountered an error! Saving toots to database and moving to next followed account.")
         db.commit()
-# db.close()
 print("Done!")


@@ -5,7 +5,7 @@
 import mastodon
 import os, random, re, json
-import create
+import functions
 from bs4 import BeautifulSoup
 cfg = json.load(open('config.json', 'r'))
@@ -17,40 +17,25 @@ client = mastodon.Mastodon(
     api_base_url=cfg['site'])
 def extract_toot(toot):
-    #copied from main.py, see there for comments
-    soup = BeautifulSoup(toot, "html.parser")
-    for lb in soup.select("br"):
-        lb.insert_after("\n")
-        lb.decompose()
-    for p in soup.select("p"):
-        p.insert_after("\n")
-        p.unwrap()
-    for ht in soup.select("a.hashtag"):
-        ht.unwrap()
-    for link in soup.select("a"):
-        link.insert_after(link["href"])
-        link.decompose()
-    text = map(lambda a: a.strip(), soup.get_text().strip().split("\n"))
-    text = "\n".join(list(text))
-    text = re.sub("https?://([^/]+)/(@[^ ]+)", r"\2@\1", text) #put mentions back in
-    text = re.sub("^@[^@]+@[^ ]+ *", r"", text) #...but remove the initial one
-    text = text.lower() #for easier matching
+    text = functions.extract_toot(toot)
+    text = re.sub(r"^@[^@]+@[^ ]+\s*", r"", text) #remove the initial mention
+    text = text.lower() #treat text as lowercase for easier keyword matching (if this bot uses it)
     return text
 class ReplyListener(mastodon.StreamListener):
-    def on_notification(self, notification):
-        if notification['type'] == 'mention':
-            acct = "@" + notification['account']['acct']
+    def on_notification(self, notification): #listen for notifications
+        if notification['type'] == 'mention': #if we're mentioned:
+            acct = "@" + notification['account']['acct'] #get the account's @
             post_id = notification['status']['id']
             mention = extract_toot(notification['status']['content'])
-            toot = create.make_toot(True)['toot']
-            toot = acct + " " + toot
-            print(acct + " says " + mention)
+            toot = functions.make_toot(True)['toot'] #generate a toot
+            toot = acct + " " + toot #prepend the @
+            print(acct + " says " + mention) #logging
             visibility = notification['status']['visibility']
             if visibility == "public":
                 visibility = "unlisted"
-            client.status_post(toot, post_id, visibility=visibility, spoiler_text = cfg['cw'])
-            print("replied with " + toot)
+            client.status_post(toot, post_id, visibility=visibility, spoiler_text = cfg['cw']) #send toost
+            print("replied with " + toot) #logging
 rl = ReplyListener()
-client.stream_user(rl)
+client.stream_user(rl) #go!