Compare commits

..

2 commits

Author SHA1 Message Date
05ca46ffef
Ignore mypy cache 2021-08-21 15:11:26 +10:00
f67fbefb5e
Preserve newlines in toot corpus
The original code was already trying to do this, but in a way whose
result Beautiful Soup ended up stripping out. Appending the newlines as
text preserves them properly, which prevents the bot from accidentally
smooshing words together.
2021-08-21 15:09:59 +10:00
2 changed files with 3 additions and 2 deletions

1
.gitignore vendored
View file

@@ -6,6 +6,7 @@ meme.jpg
toots.db toots.db
toots.db-journal toots.db-journal
toots.db-wal toots.db-wal
.mypy_cache/
__pycache__/ __pycache__/
__pypackages__/ __pypackages__/
.vscode/ .vscode/

View file

@@ -80,10 +80,10 @@ def extract_toot(toot):
toot = html.unescape(toot) # convert HTML escape codes to text toot = html.unescape(toot) # convert HTML escape codes to text
soup = BeautifulSoup(toot, "html.parser") soup = BeautifulSoup(toot, "html.parser")
for lb in soup.select("br"): # replace <br> with linebreak for lb in soup.select("br"): # replace <br> with linebreak
lb.name = "\n" lb.append('\n')
for p in soup.select("p"): # ditto for <p> for p in soup.select("p"): # ditto for <p>
p.name = "\n" p.append('\n\n')
for ht in soup.select("a.hashtag"): # convert hashtags from links to text for ht in soup.select("a.hashtag"): # convert hashtags from links to text
ht.unwrap() ht.unwrap()