Preserve newlines in toot corpus

The original code was already trying to do this by renaming the <br> and
<p> tags to "\n", but Beautiful Soup ended up stripping those out.
Appending newline text to each element instead preserves the newlines
properly, which prevents the bot from accidentally smooshing words
together.
This commit is contained in:
Danielle McLean 2021-08-21 15:09:59 +10:00
parent c0f8f1da38
commit f67fbefb5e
Signed by: 00dani
GPG key ID: 9DDE1EDE01E3A605

View file

@@ -80,10 +80,10 @@ def extract_toot(toot):
toot = html.unescape(toot) # convert HTML escape codes to text toot = html.unescape(toot) # convert HTML escape codes to text
soup = BeautifulSoup(toot, "html.parser") soup = BeautifulSoup(toot, "html.parser")
for lb in soup.select("br"): # replace <br> with linebreak for lb in soup.select("br"): # replace <br> with linebreak
lb.name = "\n" lb.append('\n')
for p in soup.select("p"): # ditto for <p> for p in soup.select("p"): # ditto for <p>
p.name = "\n" p.append('\n\n')
for ht in soup.select("a.hashtag"): # convert hashtags from links to text for ht in soup.select("a.hashtag"): # convert hashtags from links to text
ht.unwrap() ht.unwrap()