From a08681e7370c6c6d66ccbbb4a8beebccad94a1fa Mon Sep 17 00:00:00 2001
From: Jess 3Jane
Date: Wed, 21 Feb 2018 04:06:35 -0500
Subject: [PATCH] newlines work

also did a bunch of work for future silly stuff
---
 gen.py  |  4 ++--
 main.py | 51 ++++++++++++++++++++++++++++++++++++++-------------
 2 files changed, 40 insertions(+), 15 deletions(-)

diff --git a/gen.py b/gen.py
index a4cd651..3cf1b15 100644
--- a/gen.py
+++ b/gen.py
@@ -14,8 +14,8 @@ with open("corpus.txt") as fp:
     model = markovify.NewlineText(fp.read())
 
 print("tooting")
-# This is not the best long term fix tbh
 sentence = None
+# you will make that damn sentence
 while sentence is None:
     sentence = model.make_sentence(tries=100000)
-client.toot(sentence.replace(chr(31), "\n"))
+client.toot(sentence.replace("\0", "\n"))
diff --git a/main.py b/main.py
index 71677af..5b38f54 100644
--- a/main.py
+++ b/main.py
@@ -1,6 +1,7 @@
 from mastodon import Mastodon
 from getpass import getpass
 from os import path
+from bs4 import BeautifulSoup
 import json
 import re
 
@@ -17,25 +18,49 @@ if not path.exists("usercred.secret"):
     client = Mastodon(client_id="clientcred.secret", api_base_url=api_base_url)
     client.log_in(email, password, to_file="usercred.secret")
 
-def remove_tags(text):
-    text = text.strip().replace("<br>", chr(31))
-    TAG_RE = re.compile(r'<[^>]+>')
-    next_re = TAG_RE.sub('', text)
-    last = re.sub(r"(?:\@|https?\"//)\S+", "", next_re)
-    if len(last) > 0:
-        if last[0] == " ":
-            last = last[1:]
-    else:
-        last = ""
-    return last
+def parse_toot(toot):
+    soup = BeautifulSoup(toot.content, "html.parser")
+    if toot.spoiler_text != "": return
+    if toot.reblog is not None: return
+    if toot.visibility not in ["public", "unlisted"]: return
+
+    # remove all mentions
+    for mention in soup.select("span"):
+        mention.decompose()
+
+    # make all linebreaks actual linebreaks
+    for lb in soup.select("br"):
+        lb.insert_after("\n")
+        lb.decompose()
+
+    # put each p element on its own line because sometimes they decide not to be
+    for p in soup.select("p"):
+        p.insert_after("\n")
+        p.unwrap()
+
+    # unwrap all links (i like the bots posting links)
+    links = []
+    for link in soup.select("a"):
+        links += [link["href"]]
+        link.decompose()
+
+    text = map(lambda a: a.strip(), soup.get_text().strip().split("\n"))
+
+    mentions = [mention.acct for mention in toot.mentions]
+
+    # next up: store this and patch markovify to take it
+    # return {"text": text, "mentions": mentions, "links": links}
+    # it's 4am though so we're not doing that now, but i still want the parser updates
+    return "\0".join(list(text) + links)
 
 def get_toots(client, id):
     i = 0
     toots = client.account_statuses(id)
     while toots is not None:
         for toot in toots:
-            if toot.spoiler_text == "" and toot.reblog is None and toot.visibility in ["public", "unlisted"]:
-                yield remove_tags(toot.content)
+            t = parse_toot(toot)
+            if t is not None:
+                yield t
         toots = client.fetch_next(toots)
         i += 1
         if i%10 == 0: