SansPapyrus683 · July 28, 2024 15:09
diff --git a/twitter.py b/twitter.py
 import json
 import os
 import re
 import shutil
 from datetime import datetime, timedelta
 from email import utils

 import requests


 def load_twt_obj(file: str) -> list:
     """Parse the JSON payload stored in a twitter-archive ``.js`` data file.

     Everything up to and including the first ``=`` in the file is dropped and
     the remainder is decoded as JSON, so a file of the form
     ``<some js name> = [...]`` yields the list on the right-hand side. A file
     without any ``=`` is decoded as-is.

     Args:
         file: path of the file to read (decoded as UTF-8).

     Returns:
         Whatever ``json.loads`` produces for the payload.

     Raises:
         OSError: if the file cannot be opened.
         json.JSONDecodeError: if the payload is not valid JSON.
     """
     # a context manager so the handle is closed even when decoding fails
     with open(file, encoding="utf8") as src:
         raw = src.read()
     return json.loads(raw[raw.find("=") + 1 :])


 # every tweet from the archive: the regular ones followed by the deleted ones
 tweets = load_twt_obj("data/tweets.js") + load_twt_obj("data/deleted-tweets.js")

 del_dir = "data/deleted_tweets_media"
 gen_dir = "data/tweets_media"
 # pool the deleted tweets' media into the regular media folder so that a
 # single directory listing covers both
 for deleted_name in os.listdir(del_dir):
     shutil.copy(os.path.join(del_dir, deleted_name), gen_dir)

 # after getting the actual images this isn't needed but just in case
 # index the pooled files as all_media[post id][image id] = extension, where a
 # file name is split as "<post id>-<image id><extension>"
 all_raw_media = os.listdir(gen_dir)
 all_media = {}
 for fname in all_raw_media:
     dash = fname.find("-")
     post_id = fname[:dash]
     img_id = fname[dash + 1 : fname.rfind(".")]
     ext = os.path.splitext(fname)[1]
     all_media.setdefault(post_id, {})[img_id] = ext

 # sort them from oldest to newest
 tweets.sort(key=lambda t: utils.parsedate_to_datetime(t["tweet"]["created_at"]))

 # text of the form "RT @<handle>: ..." -- group 1 is the handle
 handle_fmt = re.compile(r"RT @([^:]*):")
 # group 1 is the media id: the characters after ".../media/" up to the first "."
 img_id_fmt = re.compile(r"http://pbs\.twimg\.com/media/([^\.*]*)\.")
 os.makedirs("good_media", exist_ok=True)
 # paths of every file saved into good_media
 all_paths = []

 print(f"alright, a total of {len(tweets)} tweets to go through. let's go!")
 for v, t in enumerate(tweets):
     if (v + 1) % 100 == 0:
         print(f"at tweet #{v + 1}")

     t = t["tweet"]
     match = handle_fmt.match(t["full_text"])
     if match is None:
         # text doesn't start with "RT @...:", nothing to save
         continue

     handle = match.group(1)
     og_id = t["id"]
     if "media" not in t["entities"]:
         continue

     media = t["extended_entities"]["media"]
     src_id = [m["source_status_id"] for m in media]
     assert len(set(src_id)) == 1  # just a sanity check
     src_id = src_id[0]

     curr_paths = []
     for img_at, m in enumerate(media):
         img_id = img_id_fmt.match(m["media_url"])
         # sometimes you have things like ext_tw_video_thumb or tweet_video_thumb
         if img_id is None:
             continue

         img_id = img_id.group(1)
         # only handle media that also exists as a file in the archive
         if img_id not in all_media.get(og_id, {}):
             continue

         ext = all_media[og_id][img_id]
         # the archive's own copy of this file, and the name it gets in good_media
         stupid_path = os.path.join(gen_dir, f"{og_id}-{img_id}{ext}")
         sigma_path = f"good_media/{handle}_{src_id}_{img_at}{ext}"

         dl_url = f"http://pbs.twimg.com/media/{img_id}{ext}:orig"
         try:
             # a timeout so one stalled request can't hang the whole run, and a
             # status check so an error page is never saved as if it were an image
             resp = requests.get(dl_url, timeout=30)
             resp.raise_for_status()
         except requests.RequestException as err:
             # the archive copy is known to exist (it is how all_media was built)
             print(f"couldn't download {dl_url} ({err}), using the archive copy")
             shutil.copy(stupid_path, sigma_path)
         else:
             with open(sigma_path, "wb") as written:
                 written.write(resp.content)
         curr_paths.append(sigma_path)

     # a tweet's files are recorded last image first
     all_paths.extend(reversed(curr_paths))

 # give the saved files modification/access times spaced 2 seconds apart,
 # walking all_paths back to front so its last entry gets the newest time
 now = datetime.now()
 # .timestamp() reads the naive `now` as local time and returns real POSIX
 # seconds; subtracting a naive datetime(1970, 1, 1) instead would leave every
 # value shifted by the local UTC offset
 newest = now.timestamp()
 for v, p in enumerate(reversed(all_paths)):
     stamp = newest - 2 * v
     os.utime(p, times=(stamp, stamp))
	import json
	import os
	import re
	import shutil
	from datetime import datetime, timedelta
	from email import utils

	import requests


	def load_twt_obj(file: str) -> list:
	    """Parse the JSON payload stored in a twitter-archive ``.js`` data file.

	    Everything up to and including the first ``=`` in the file is dropped and
	    the remainder is decoded as JSON, so a file of the form
	    ``<some js name> = [...]`` yields the list on the right-hand side. A file
	    without any ``=`` is decoded as-is.

	    Args:
	        file: path of the file to read (decoded as UTF-8).

	    Returns:
	        Whatever ``json.loads`` produces for the payload.

	    Raises:
	        OSError: if the file cannot be opened.
	        json.JSONDecodeError: if the payload is not valid JSON.
	    """
	    # a context manager so the handle is closed even when decoding fails
	    with open(file, encoding="utf8") as src:
	        raw = src.read()
	    return json.loads(raw[raw.find("=") + 1 :])


	# every tweet from the archive: the regular ones followed by the deleted ones
	tweets = load_twt_obj("data/tweets.js") + load_twt_obj("data/deleted-tweets.js")

	del_dir = "data/deleted_tweets_media"
	gen_dir = "data/tweets_media"
	# pool the deleted tweets' media into the regular media folder so that a
	# single directory listing covers both
	for fn in os.listdir(del_dir):
	    shutil.copy(os.path.join(del_dir, fn), gen_dir)

	# after getting the actual images this isn't needed but just in case
	# index the pooled files as all_media[post id][image id] = extension, where a
	# file name is split as "<post id>-<image id><extension>"
	all_raw_media = os.listdir(gen_dir)
	all_media = {}
	for i in all_raw_media:
	    post_id = i[: i.find("-")]
	    img_id = i[i.find("-") + 1 : i.rfind(".")]
	    _, ext = os.path.splitext(i)
	    if post_id not in all_media:
	        all_media[post_id] = {}
	    all_media[post_id][img_id] = ext

	# sort them from oldest to newest
	tweets.sort(key=lambda t: utils.parsedate_to_datetime(t["tweet"]["created_at"]))

	# text of the form "RT @<handle>: ..." -- group 1 is the handle
	handle_fmt = re.compile(r"RT @([^:]*):")
	# group 1 is the media id: the characters after ".../media/" up to the first "."
	# (the capture group needs its repetition; a single-character group never
	# matches a whole id)
	img_id_fmt = re.compile(r"http://pbs\.twimg\.com/media/([^\.*]*)\.")
	os.makedirs("good_media", exist_ok=True)
	# paths of every file saved into good_media
	all_paths = []

	print(f"alright, a total of {len(tweets)} tweets to go through. let's go!")
	for v, t in enumerate(tweets):
	    if (v + 1) % 100 == 0:
	        print(f"at tweet #{v + 1}")

	    t = t["tweet"]
	    match = handle_fmt.match(t["full_text"])
	    if match is None:
	        # text doesn't start with "RT @...:", nothing to save
	        continue

	    handle = match.group(1)
	    og_id = t["id"]
	    if "media" not in t["entities"]:
	        continue

	    media = t["extended_entities"]["media"]
	    src_id = [m["source_status_id"] for m in media]
	    assert len(set(src_id)) == 1  # just a sanity check
	    src_id = src_id[0]

	    curr_paths = []
	    for img_at, m in enumerate(media):
	        img_id = img_id_fmt.match(m["media_url"])
	        # sometimes you have things like ext_tw_video_thumb or tweet_video_thumb
	        if img_id is None:
	            continue

	        img_id = img_id.group(1)
	        # only handle media that also exists as a file in the archive
	        if img_id not in all_media.get(og_id, {}):
	            continue

	        ext = all_media[og_id][img_id]
	        # the archive's own copy of this file, and the name it gets in good_media
	        stupid_path = os.path.join(gen_dir, f"{og_id}-{img_id}{ext}")
	        sigma_path = f"good_media/{handle}_{src_id}_{img_at}{ext}"

	        dl_url = f"http://pbs.twimg.com/media/{img_id}{ext}:orig"
	        try:
	            # a timeout so one stalled request can't hang the whole run, and a
	            # status check so an error page is never saved as if it were an image
	            resp = requests.get(dl_url, timeout=30)
	            resp.raise_for_status()
	        except requests.RequestException as err:
	            # the archive copy is known to exist (it is how all_media was built)
	            print(f"couldn't download {dl_url} ({err}), using the archive copy")
	            shutil.copy(stupid_path, sigma_path)
	        else:
	            with open(sigma_path, "wb") as written:
	                written.write(resp.content)
	        curr_paths.append(sigma_path)

	    # a tweet's files are recorded last image first
	    all_paths.extend(reversed(curr_paths))

	# give the saved files modification/access times spaced 2 seconds apart,
	# walking all_paths back to front so its last entry gets the newest time
	now = datetime.now()
	# .timestamp() reads the naive `now` as local time and returns real POSIX
	# seconds; subtracting a naive datetime(1970, 1, 1) instead would leave every
	# value shifted by the local UTC offset
	newest = now.timestamp()
	for v, p in enumerate(reversed(all_paths)):
	    stamp = newest - 2 * v
	    os.utime(p, times=(stamp, stamp))