Last active
December 24, 2023 08:07
-
-
Save yukiarimo/f3eea376e96f79e1c537bd2fad93328b to your computer and use it in GitHub Desktop.
Character AI Dialog Extractor
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
from bs4 import BeautifulSoup | |
# Speaker labels exactly as they appear in the exported chat, plus file paths.
USER_NAME = "Yuki Arimo"
CHARACTER_NAME = "Rushia Uruha"
INPUT_PATH = "Main - Rushia Uruha.html"
OUTPUT_PATH = "output.json"


def _collect_turns(texts, user_name, character_name):
    """Group a flat stream of span texts into ``(speaker, message)`` turns.

    A text equal to one of the two speaker names (after stripping) is a
    header that opens that speaker's turn; every following non-empty text
    is part of the message until the next header.  Text seen before the
    first header is ignored.  A repeated header from the same speaker
    restarts the turn, so only the text after the last header is kept.
    Turns are recorded even when empty so that an empty turn still
    separates its neighbours; consecutive turns therefore always alternate
    between the two speakers.
    """
    turns = []
    speaker = None
    parts = []
    for raw in texts:
        text = raw.strip()
        if text == user_name or text == character_name:
            if speaker is not None and text != speaker:
                turns.append((speaker, " ".join(parts)))
            speaker = text
            parts = []
        elif speaker is not None and text:
            # Track the speaker explicitly: an empty user turn must not
            # swallow the character's text that follows it.
            parts.append(text)
    if speaker is not None:
        turns.append((speaker, " ".join(parts)))
    return turns


def _pair_turns(turns, user_name, character_name):
    """Return one dict per user turn that is directly followed by a reply."""
    return [
        {"main_text": prompt, "target_text": reply}
        for (who, prompt), (next_who, reply) in zip(turns, turns[1:])
        if who == user_name and next_who == character_name and prompt and reply
    ]


def extract_dialog(texts, user_name=USER_NAME, character_name=CHARACTER_NAME):
    """Extract prompt/reply pairs from the texts of a saved chat page.

    Args:
        texts: Iterable of strings, one per text span, in document order.
        user_name: Header text that marks a message written by the user.
        character_name: Header text that marks a message by the character.

    Returns:
        A list of ``{"main_text": ..., "target_text": ...}`` dicts, one for
        each non-empty user message immediately followed by a non-empty
        character message.  The pieces of a message are joined with single
        spaces, without leading or trailing whitespace.
    """
    turns = _collect_turns(texts, user_name, character_name)
    return _pair_turns(turns, user_name, character_name)


def main():
    """Read the exported HTML page and write the dialog pairs as JSON."""
    # NOTE(review): assumes the HTML export is UTF-8 encoded - confirm for
    # files produced by tools other than the documented conversion step.
    with open(INPUT_PATH, encoding="utf-8") as fp:
        soup = BeautifulSoup(fp, "html.parser")

    spans = soup.find_all("span", class_="s1")
    pairs = extract_dialog(tag.get_text() for tag in spans)

    # ensure_ascii=False emits raw non-ASCII characters, so the output file
    # needs an explicit UTF-8 encoding; the context manager closes it.
    with open(OUTPUT_PATH, "w", encoding="utf-8") as out:
        json.dump(pairs, out, ensure_ascii=False, indent=4)


if __name__ == "__main__":
    main()
Author
yukiarimo
commented
Apr 7, 2023
•
- Share your dialog with AI on beta.character.ai (you can use the "link only" option)
- Save the webpage as a .webarchive file
- Convert to HTML using: textutil -convert html 'Main - Rushia Uruha.webarchive'
- Run my script in the same folder as the converted HTML file; it writes the extracted dialog pairs to output.json
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment