Skip to content

Instantly share code, notes, and snippets.

@Cyberes
Last active May 26, 2023 07:06
Show Gist options
  • Save Cyberes/6de1246636261d1ee3f6a659e6666077 to your computer and use it in GitHub Desktop.
Convert SillyTavern jsonl chats to TXT files using AI.
#!/usr/bin/env python3
import argparse
import re
from pathlib import Path
import sys
import json
import openai
import tiktoken
import threading
import traceback
"""
Convert SillyTavern jsonl chats to TXT files using AI.
HOW TO USE:
1. `pip install tiktoken openai`
2. Find the chat file you want to convert. It's the `jsonl` file located in `SillyTavern/public/chats/<character name>/`
3. Run this script with `python3 sillytavern-chat-to-txt.py <path to the jsonl file> <path to where you want to save the TXT file> --key <your OpenAI API key>`
This uses a temperature of 0 so don't re-run this expecting something different.
If your chat is larger than the context window it will be sent in batches. After each batch, the response is written to your output file.
"""
class TimerThread(threading.Thread):
    """Background thread that prints a live elapsed-seconds counter until stopped."""

    def __init__(self, prompt: str = 'Waiting for response...'):
        super().__init__()
        self._stop_event = threading.Event()
        self.prompt = prompt

    def run(self):
        # Tick roughly once per second, rewriting the same console line.
        elapsed = 0
        while not self._stop_event.is_set():
            print(f"\r{self.prompt} {elapsed}s", end="")
            elapsed += 1
            self._stop_event.wait(1)

    def stop(self):
        # Signal the loop to exit, then move the cursor to a fresh line.
        self._stop_event.set()
        print('')
def count_tokens(string: str, encoding_name: str = 'cl100k_base', encoding_for_model: str = None) -> int:
    """Return the number of tiktoken tokens in *string*.

    If *encoding_for_model* is given, the encoding is resolved from that model
    name; otherwise *encoding_name* is used directly.
    """
    if encoding_for_model:
        encoder = tiktoken.encoding_for_model(encoding_for_model)
    else:
        encoder = tiktoken.get_encoding(encoding_name)
    return len(encoder.encode(string))
def array_of_dicts_to_jsonl(array_of_dicts):
    """Serialize a list of dicts into a JSON-lines string (one JSON object per line)."""
    serialized_rows = []
    for record in array_of_dicts:
        serialized_rows.append(json.dumps(record))
    return "\n".join(serialized_rows)
def send_to_openai(msg, model):
    """Send one jsonl batch to the OpenAI chat API and return the raw response.

    Prints the error and exits with status 1 if the API call fails.

    Fix: the timer thread is now stopped in a ``finally`` block. Previously an
    API error skipped ``timer_thread.stop()``, and ``sys.exit(1)`` then blocked
    forever waiting on the still-running non-daemon thread.
    """
    user_msg = f"I have a jsonl transcript of an internet roleplay session. I need you to strip everything that isn't important to the story and write a summary of each message. For each message, use the format:\n\n```\n<character name>:\n<what the character says and important actions. don't use double linebreaks except for separating characters>```\n\nSome messages include things that other characters say so please organize it accordingly.\n\n{msg}"
    timer_thread = TimerThread(prompt=f'Sending {count_tokens(user_msg)} tokens to the AI...')
    timer_thread.start()
    try:
        response = openai.ChatCompletion.create(
            model=model,
            messages=[
                {"role": "user", "content": user_msg}
            ],
            # Deterministic output: re-running should not produce different text.
            temperature=0,
        )
    except Exception as e:
        print('Exception:', e)
        sys.exit(1)
    finally:
        # Runs on both success and failure (including the sys.exit path,
        # since SystemExit propagates through finally).
        timer_thread.stop()
    return response
def main():
    """Parse CLI args, read the jsonl chat, batch it under the model's token
    limit, and append each batch's AI summary to the output TXT file."""
    parser = argparse.ArgumentParser(description='Convert SillyTavern jsonl files to TXT files using AI for importing into the infinite context server.')
    parser.add_argument('filepath', help='The path to the jsonl file to parse')
    parser.add_argument('output_txt', help='The output TXT file to create.')
    parser.add_argument('--key', required=True, help='Your OpenAI API key')
    parser.add_argument('--model', default='gpt-4', help='Name of the OpenAI model to use. GPT-4 seems to work the best for this. Default: gpt-4')
    args = parser.parse_args()

    openai.api_key = args.key
    input_jsonl = Path(args.filepath).expanduser().absolute().resolve()
    output_txt = Path(args.output_txt).expanduser().absolute().resolve()

    print('Converting chat:', input_jsonl)
    print('Using model:', args.model)

    # Validate both paths BEFORE touching the output file. (Fix: the old code
    # unlinked the output first, which raised FileNotFoundError when the file
    # did not exist yet and truncated it even when the input path was bad.)
    if not input_jsonl.exists():
        print('Input file does not exist:', input_jsonl)
        sys.exit(1)
    if not output_txt.parent.exists():
        print('Output parent directory does not exist:', output_txt.parent)
        sys.exit(1)

    # Empty the output file since we append to it below.
    if output_txt.exists():
        output_txt.unlink()
    output_txt.touch()

    # Halve the context window since input and output tokens share it.
    if args.model in ('gpt-3.5-turbo', 'text-davinci-003'):
        max_tokens = 3050 / 2
    elif args.model == 'gpt-4':
        max_tokens = 8050 / 2
    else:
        print('Unknown model:', args.model)
        sys.exit(1)

    chatlines = []
    total_tokens = 0
    raw = input_jsonl.read_text().splitlines()
    for i, line in enumerate(raw):
        try:
            tmp = json.loads(line)
            # Skip metadata entries; only real chat messages carry 'mes'.
            if 'mes' not in tmp.keys():
                continue
            # Trim each message to just the speaker and text to save tokens.
            msg = json.dumps({'name': tmp['name'], 'mes': tmp['mes']})
            # A single message can't be split, so one over the limit is fatal.
            token_count = count_tokens(msg)
            total_tokens += token_count
            if token_count > max_tokens:
                # Fix: previously printed the literal string 'token_count'
                # instead of the actual token count.
                print('Message on line', i + 1, 'is too long at', token_count, 'tokens. Max tokens is', max_tokens, 'You need to decide how to handle this.')
                sys.exit(1)
            chatlines.append(json.loads(msg))
        except json.decoder.JSONDecodeError:
            print(f'JSON decode error on line {i + 1}:')
            sys.exit(1)

    num_chat_messages = len(chatlines)
    print('Total tokens:', total_tokens)

    # Greedily pack messages into batches that fit the context window.
    while len(chatlines):
        ai_input_data = []
        output_data = []
        while True:
            # Tentatively add the next message, then check the batch still fits.
            ai_input_data.append(chatlines[0])
            ai_input = array_of_dicts_to_jsonl(ai_input_data)
            token_count = count_tokens(ai_input)
            if token_count <= max_tokens:
                # Only consume the message once we know it fits.
                del chatlines[0]
            else:
                # Batch overflowed. Fix: drop the LAST (just-appended) message
                # — the old code deleted index 0, losing an already-consumed
                # message and duplicating the new one — and rebuild ai_input so
                # we don't send the stale over-limit batch to the API.
                del ai_input_data[-1]
                ai_input = array_of_dicts_to_jsonl(ai_input_data)
                output_data.append(send_to_openai(ai_input, args.model).choices[0].message.content.strip())
                break
            # No messages left means everything fit into this final batch.
            if len(chatlines) == 0:
                output_data.append(send_to_openai(ai_input, args.model).choices[0].message.content.strip())
                break
        # Append this batch's summaries to the output file as we go, so
        # progress survives a crash mid-conversion.
        if len(output_data):
            with open(output_txt, 'a') as f:
                for msg in output_data:
                    f.write(f"{msg}\n\n\n")

    print(f'Converted {num_chat_messages} lines.')
    print('Saved to:', output_txt)


if __name__ == "__main__":
    main()
@Cyberes
Copy link
Author

Cyberes commented May 26, 2023

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment