diff --git a/main.py b/main.py index acf435a..eaa0d3b 100755 --- a/main.py +++ b/main.py @@ -14,16 +14,16 @@ scopes = ["read:statuses", "read:accounts", "read:follows", "write:statuses"] cfg = json.load(open('config.json', 'r')) if os.path.exists("clientcred.secret"): - print("Upgrading to new storage method") - cc = open("clientcred.secret").read().split("\n") - cfg['client'] = { - "id": cc[0], - "secret": cc[1] - } - cfg['secret'] = open("usercred.secret").read().rstrip("\n") - os.remove("clientcred.secret") - os.remove("usercred.secret") - + print("Upgrading to new storage method") + cc = open("clientcred.secret").read().split("\n") + cfg['client'] = { + "id": cc[0], + "secret": cc[1] + } + cfg['secret'] = open("usercred.secret").read().rstrip("\n") + os.remove("clientcred.secret") + os.remove("usercred.secret") + if "client" not in cfg: print("No client credentials, registering application") @@ -105,6 +105,26 @@ def handleCtrlC(signal, frame): signal.signal(signal.SIGINT, handleCtrlC) +def get_toots_legacy(client, id): + i = 0 + toots = client.account_statuses(id) + while toots is not None and len(toots) > 0: + for toot in toots: + if toot.spoiler_text != "": continue + if toot.reblog is not None: continue + if toot.visibility not in ["public", "unlisted"]: continue + t = extract_toot(toot.content) + if t != None: + yield { + "toot": t, + "id": toot.id, + "uri": toot.uri + } + toots = client.fetch_next(toots) + i += 1 + if i%20 == 0: + print('.', end='', flush=True) + for f in following: last_toot = c.execute("SELECT id FROM `toots` WHERE userid LIKE ? ORDER BY id DESC LIMIT 1", (f.id,)).fetchone() if last_toot != None: @@ -114,7 +134,7 @@ for f in following: print("Harvesting toots for user @{}, starting from {}".format(f.acct, last_toot)) #find the user's activitypub outbox - #print("WebFingering...") + print("WebFingering...") instance = re.search(r"^.*@(.+)", f.acct) if instance == None: instance = re.search(r"https?:\/\/(.*)", cfg['site']).group(1) @@ -124,49 +144,73 @@ for f in following: if instance == "bofa.lol": print("rest in piece bofa, skipping") continue - + # print("{} is on {}".format(f.acct, instance)) try: r = requests.get("https://{}/.well-known/host-meta".format(instance)) uri = re.search(r'template="([^"]+)"', r.text).group(1) uri = uri.format(uri = "{}@{}".format(f.username, instance)) - r = requests.get(uri) - uri = r.json()['aliases'][1] #TODO: find out if it's safe to rely on this + r = requests.get(uri, headers={"Accept": "application/json"}) + j = r.json() + if len(j['aliases']) == 1: #TODO: this is a hack on top of a hack, fix it + uri = j['aliases'][0] + else: + uri = j['aliases'][1] uri = "{}/outbox?page=true&min_id={}".format(uri, last_toot) r = requests.get(uri) j = r.json() except Exception: print("oopsy woopsy!! we made a fucky wucky!!!\n(we're probably rate limited, please hang up and try again)") sys.exit(1) + + pleroma = False + if 'first' in j: + print("{} is a pleroma instance -- falling back to legacy toot collection method".format(instance)) + pleroma = True print("Downloading and parsing toots", end='', flush=True) current = None try: - while len(j['orderedItems']) > 0: - for oi in j['orderedItems']: - if oi['type'] == "Create": - # its a toost baby - content = oi['object']['content'] - if oi['object']['summary'] != None: - #don't download CW'd toots - continue - toot = extract_toot(content) - # print(toot) - try: - c.execute("REPLACE INTO toots (id, userid, uri, content) VALUES (?, ?, ?, ?)", - (re.search(r"[^\/]+$", oi['object']['id']).group(0), - f.id, - oi['object']['id'], - toot - ) + if pleroma: + for t in get_toots_legacy(client, f.id): + try: + c.execute("REPLACE INTO toots (id, userid, uri, content) VALUES (?, ?, ?, ?)", + (t['id'], + f.id, + t['uri'], + t['toot'] ) - pass - except: - pass #ignore any toots that don't go into the DB - # sys.exit(0) - r = requests.get(j['prev']) - j = r.json() - print('.', end='', flush=True) + ) + except: + pass + + else: + while len(j['orderedItems']) > 0: + for oi in j['orderedItems']: + if (not pleroma and oi['type'] == "Create") or (pleroma and oi['to']['type'] == "Create"): + # its a toost baby + content = oi['object']['content'] + if oi['object']['summary'] != None: + #don't download CW'd toots + continue + toot = extract_toot(content) + # print(toot) + try: + pid = re.search(r"[^\/]+$", oi['object']['id']).group(0) + c.execute("REPLACE INTO toots (id, userid, uri, content) VALUES (?, ?, ?, ?)", + (pid, + f.id, + oi['object']['id'], + toot + ) + ) + pass + except: + pass #ignore any toots that don't go into the DB + # sys.exit(0) + r = requests.get(j['prev']) + j = r.json() + print('.', end='', flush=True) print(" Done!") db.commit() except: