From a08681e7370c6c6d66ccbbb4a8beebccad94a1fa Mon Sep 17 00:00:00 2001
From: Jess 3Jane
Date: Wed, 21 Feb 2018 04:06:35 -0500
Subject: [PATCH] newlines work

also did a bunch of work for future silly stuff
---
 gen.py  |  4 ++--
 main.py | 51 ++++++++++++++++++++++++++++++++++++++-------------
 2 files changed, 40 insertions(+), 15 deletions(-)

diff --git a/gen.py b/gen.py
index a4cd651..3cf1b15 100644
--- a/gen.py
+++ b/gen.py
@@ -14,8 +14,8 @@ with open("corpus.txt") as fp:
     model = markovify.NewlineText(fp.read())
 
 print("tooting")
-# This is not the best long term fix tbh
 sentence = None
+# you will make that damn sentence
 while sentence is None:
     sentence = model.make_sentence(tries=100000)
-client.toot(sentence.replace(chr(31), "\n"))
+client.toot(sentence.replace("\0", "\n"))
diff --git a/main.py b/main.py
index 71677af..5b38f54 100644
--- a/main.py
+++ b/main.py
@@ -1,6 +1,7 @@
 from mastodon import Mastodon
 from getpass import getpass
 from os import path
+from bs4 import BeautifulSoup
 import json
 import re
 
@@ -17,25 +18,49 @@ if not path.exists("usercred.secret"):
     client = Mastodon(client_id="clientcred.secret", api_base_url=api_base_url)
     client.log_in(email, password, to_file="usercred.secret")
 
-def remove_tags(text):
-    text = text.strip().replace("<br>", chr(31))
-    TAG_RE = re.compile(r'<[^>]+>')
-    next_re = TAG_RE.sub('', text)
-    last = re.sub(r"(?:\@|https?\"//)\S+", "", next_re)
-    if len(last) > 0:
-        if last[0] == " ":
-            last = last[1:]
-    else:
-        last = ""
-    return last
+def parse_toot(toot):
+    soup = BeautifulSoup(toot.content, "html.parser")
+    if toot.spoiler_text != "": return
+    if toot.reblog is not None: return
+    if toot.visibility not in ["public", "unlisted"]: return
+
+    # remove all mentions
+    for mention in soup.select("span"):
+        mention.decompose()
+
+    # make all linebreaks actual linebreaks
+    for lb in soup.select("br"):
+        lb.insert_after("\n")
+        lb.decompose()
+
+    # put each p element on its own line because sometimes they decide not to be
+    for p in soup.select("p"):
+        p.insert_after("\n")
+        p.unwrap()
+
+    # unwrap all links (i like the bots posting links)
+    links = []
+    for link in soup.select("a"):
+        links += [link["href"]]
+        link.decompose()
+
+    text = map(lambda a: a.strip(), soup.get_text().strip().split("\n"))
+
+    mentions = [mention.acct for mention in toot.mentions]
+
+    # next up: store this and patch markovify to take it
+    # return {"text": text, "mentions": mentions, "links": links}
+    # it's 4am though so we're not doing that now, but i still want the parser updates
+    return "\0".join(list(text) + links)
 
 def get_toots(client, id):
     i = 0
     toots = client.account_statuses(id)
     while toots is not None:
         for toot in toots:
-            if toot.spoiler_text == "" and toot.reblog is None and toot.visibility in ["public", "unlisted"]:
-                yield remove_tags(toot.content)
+            t = parse_toot(toot)
+            if t is not None:
+                yield t
         toots = client.fetch_next(toots)
         i += 1
         if i%10 == 0: