From dd78364f2d481cdabdaeafe2fe841f41600deae8 Mon Sep 17 00:00:00 2001 From: Agatha Rose Date: Sat, 5 Jun 2021 00:14:56 +0300 Subject: [PATCH] Expose overlap ratio and length limit to config --- README.md | 23 ++++++++++++++--------- functions.py | 15 +++++++++++++-- main.py | 7 ++++++- 3 files changed, 33 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 6a9434a..a4c4d86 100644 --- a/README.md +++ b/README.md @@ -48,15 +48,20 @@ I recommend that you create your bot's account on a Mastodon instance. Creating ## Configuration Configuring mstdn-ebooks is accomplished by editing `config.json`. If you want to use a different file for configuration, specify it with the `--cfg` argument. For example, if you want to use `/home/lynne/c.json` instead, you would run `python3 main.py --cfg /home/lynne/c.json` instead of just `python3 main.py` -| Setting | Default | Meaning | -|--------------------|------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| site | https://botsin.space | The instance your bot will log in to and post from. This must start with `https://` or `http://` (preferably the latter) | -| cw | null | The content warning (aka subject) mstdn-ebooks will apply to non-error posts. | -| instance_blacklist | ["bofa.lol", "witches.town", "knzk.me"] | If your bot is following someone from a blacklisted instance, it will skip over them and not download their posts. This is useful for ensuring that mstdn-ebooks doesn't waste time trying to download posts from dead instances, without you having to unfollow the user(s) from them. | -| learn_from_cw | false | If true, mstdn-ebooks will learn from CW'd posts. | -| mention_handling | 1 | 0: Never use mentions. 
1: Only generate fake mentions in the middle of posts, never at the start. 2: Use mentions as normal (old behaviour). | -| max_thread_length | 15 | The maximum number of bot posts in a thread before it stops replying. A thread can be 10 or 10000 posts long, but the bot will stop after it has posted `max_thread_length` times. | -| strip_paired_punctuation | false | If true, mstdn-ebooks will remove punctuation that commonly appears in pairs, like " and (). This avoids the issue of posts that open a bracket (or quote) without closing it. | +| Setting | Default | Meaning | +|--------------------------|-----------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| site | https://botsin.space | The instance your bot will log in to and post from. This must start with `https://` or `http://` (preferably the latter) | +| cw | null | The content warning (aka subject) mstdn-ebooks will apply to non-error posts. | +| instance_blacklist | ["bofa.lol", "witches.town", "knzk.me"] | If your bot is following someone from a blacklisted instance, it will skip over them and not download their posts. This is useful for ensuring that mstdn-ebooks doesn't waste time trying to download posts from dead instances, without you having to unfollow the user(s) from them. | +| learn_from_cw | false | If true, mstdn-ebooks will learn from CW'd posts. | +| mention_handling | 1 | 0: Never use mentions. 1: Only generate fake mentions in the middle of posts, never at the start. 2: Use mentions as normal (old behaviour). | +| max_thread_length | 15 | The maximum number of bot posts in a thread before it stops replying. A thread can be 10 or 10000 posts long, but the bot will stop after it has posted `max_thread_length` times. 
| +| strip_paired_punctuation | false | If true, mstdn-ebooks will remove punctuation that commonly appears in pairs, like " and (). This avoids the issue of posts that open a bracket (or quote) without closing it. | +| limit_length | false | If true, the sentence length will be random between `length_lower_limit` and `length_upper_limit` | +| length_lower_limit | 5 | The lower bound in the random number range above. Only matters if `limit_length` is true. | +| length_upper_limit | 50 | The upper bound in the random number range above. Can be the same as `length_lower_limit` to disable randomness. Only matters if `limit_length` is true. | +| overlap_ratio_enabled | false | If true, checks the output's similarity to the original posts. | +| overlap_ratio | 0.7 | The ratio that determines whether the output is too similar to the original posts. As the ratio decreases, both the interestingness of the output and the likelihood of failing to create output increase. Only matters if `overlap_ratio_enabled` is true. | ## Donating Please don't feel obligated to donate at all. diff --git a/functions.py b/functions.py index 0280cae..a3d6985 100755 --- a/functions.py +++ b/functions.py @@ -5,6 +5,7 @@ import markovify from bs4 import BeautifulSoup +from random import randint import re, multiprocessing, sqlite3, shutil, os, html def make_sentence(output, cfg): @@ -25,7 +26,9 @@ def make_sentence(output, cfg): output.send("Database is empty! 
Try running main.py.") return - model = nlt_fixed( + nlt = markovify.NewlineText if cfg['overlap_ratio_enabled'] else nlt_fixed + + model = nlt( "\n".join([toot[0] for toot in toots]) ) @@ -34,10 +37,18 @@ def make_sentence(output, cfg): toots_str = None + if cfg['limit_length']: + sentence_len = randint(cfg['length_lower_limit'], cfg['length_upper_limit']) + sentence = None tries = 0 while sentence is None and tries < 10: - sentence = model.make_short_sentence(500, tries=10000) + sentence = model.make_short_sentence( + max_chars=500, + tries=10000, + max_overlap_ratio=cfg['overlap_ratio'] if cfg['overlap_ratio_enabled'] else 0.7, + max_words=sentence_len if cfg['limit_length'] else None + ) tries = tries + 1 # optionally remove mentions diff --git a/main.py b/main.py index 83ea266..5fa2e0c 100755 --- a/main.py +++ b/main.py @@ -27,7 +27,12 @@ cfg = { "learn_from_cw": False, "mention_handling": 1, "max_thread_length": 15, - "strip_paired_punctuation": False + "strip_paired_punctuation": False, + "limit_length": False, + "length_lower_limit": 5, + "length_upper_limit": 50, + "overlap_ratio_enabled": False, + "overlap_ratio": 0.7 } try: