From f67fbefb5e7561416417800535761dc0607e1885 Mon Sep 17 00:00:00 2001 From: Danielle McLean Date: Sat, 21 Aug 2021 15:09:59 +1000 Subject: [PATCH] Preserve newlines in toot corpus The original code was already trying to do this, but in a way that Beautiful Soup ended up stripping out. This way preserves the newlines properly, which will prevent the bot from smooshing together words accidentally. --- functions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/functions.py b/functions.py index d9e38cb..c290358 100644 --- a/functions.py +++ b/functions.py @@ -80,10 +80,10 @@ def extract_toot(toot): toot = html.unescape(toot) # convert HTML escape codes to text soup = BeautifulSoup(toot, "html.parser") for lb in soup.select("br"): # replace <br> with linebreak - lb.name = "\n" + lb.append('\n') for p in soup.select("p"): # ditto for <p> - p.name = "\n" + p.append('\n\n') for ht in soup.select("a.hashtag"): # convert hashtags from links to text ht.unwrap()