Last active
September 4, 2023 00:49
-
-
Save usstq/c7cb000e5b76a4ece66299cd358efaa4 to your computer and use it in GitHub Desktop.
ovcausalllm
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from transformers import AutoTokenizer, AutoModelForCausalLM | |
from optimum.intel import OVModelForCausalLM | |
import time | |
import argparse | |
import os, sys | |
import numpy as np | |
import torch | |
import hashlib | |
import itertools | |
from typing import Dict, Optional, Tuple, Union | |
import ctypes | |
# beam search zero-copy WA | |
class OVModelForCausalLM_opt(OVModelForCausalLM):
    """OVModelForCausalLM with per-step latency recording and a shared beam-index buffer.

    Two additions on top of the parent class:

    * Latency recording: every call to `prepare_inputs_for_generation` is
      timestamped, and the interval since the previous call is appended to
      `self.latency`. Call `begin_latency_record()` before `generate()`.
    * Beam-index publishing: `_reorder_cache` does not reorder the cache;
      it copies the beam indices into a ctypes buffer whose address is
      exported through the `beam_idx_addr` environment variable. Call
      `setup()` once before generating.
      NOTE(review): presumably native code in the same process reads that
      address and performs the reordering itself -- confirm against the
      consumer of `beam_idx_addr`.
    """

    # Total number of c_int slots in the shared buffer. Slot 0 stores how many
    # indices are valid; the indices themselves start at slot 1.
    _BEAM_IDX_SLOTS = 2048

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs):
        """Record the time since the previous call, then defer to the parent class.

        Requires `begin_latency_record()` to have been called first so that
        `self.time0` and `self.latency` exist.
        """
        time1 = time.time()
        if self.time0 is not None:
            self.latency.append(time1 - self.time0)
        self.time0 = time1
        return super().prepare_inputs_for_generation(input_ids, past_key_values, **kwargs)

    def begin_latency_record(self):
        """Reset the latency log; the next prepare call only sets the baseline time."""
        self.time0 = None
        self.latency = []

    def latency_summary(self):
        """Format the recorded intervals as "<first>ms+(<mean of the rest> ms)x<N>".

        Returns:
            "?" when nothing was recorded. When only one interval exists there
            is nothing to average, so the mean is shown as "?" instead of
            dividing by zero.
        """
        N = len(self.latency)
        if N == 0:
            return "?"
        first_ms = self.latency[0] * 1e3
        if N == 1:
            return f"{first_ms:.1f}ms+(? ms)x{N}"
        mean_rest_ms = sum(self.latency[1:]) / (N - 1) * 1e3
        return f"{first_ms:.1f}ms+({mean_rest_ms:.1f} ms)x{N}"

    def setup(self):
        """Allocate the shared beam-index buffer and export its address.

        The address is written as a hex string to the `beam_idx_addr`
        environment variable. The buffer is kept alive by `self.beam_idx`.
        """
        # setup common memory area
        self.beam_idx = (ctypes.c_int * self._BEAM_IDX_SLOTS)(0)
        os.environ['beam_idx_addr'] = hex(ctypes.addressof(self.beam_idx))

    def clear_beam_idx(self):
        """Mark the shared buffer as holding zero beam indices."""
        self.beam_idx[0] = 0

    def _reorder_cache(
        self, past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor
    ) -> Tuple[Tuple[torch.Tensor]]:
        """Publish `beam_idx` to the shared buffer and return the cache untouched.

        Layout written to `self.beam_idx`: slot 0 = number of indices,
        slots 1..n = the flattened indices.

        Args:
            past_key_values: the key/value cache; returned as is.
            beam_idx: tensor of beam indices for the current step.

        Raises:
            ValueError: if there are more indices than the buffer can hold.
        """
        indices = torch.flatten(beam_idx).tolist()
        count = len(indices)
        # Slot 0 is reserved for the count, so only len - 1 indices fit.
        if count >= len(self.beam_idx):
            raise ValueError(
                f"beam_idx has {count} entries but the shared buffer only holds "
                f"{len(self.beam_idx) - 1}"
            )
        self.beam_idx[0] = count
        self.beam_idx[1:count + 1] = indices
        return past_key_values
# Command-line interface of the benchmark script.
parser = argparse.ArgumentParser(prog="ovcausalllm")
# nargs='?' makes the positional optional; without it argparse still requires
# the argument and the default below is never used.
parser.add_argument('model_id', nargs='?', default="bigscience/bloomz-560m", help =
'''
EleutherAI/pythia-70m
databricks/dolly-v2-2-8b
huggyllama/llama-7b
bigscience/bloomz-560m
EleutherAI/gpt-j-6B
''')
# Default prompt: one question paragraph, repeated 100 times back to back
# (no separator) to produce a very long input.
long_prompt = (
    'The 1973 oil crisis began in October 1973 when the members of the '
    'Organization of Arab Petroleum Exporting Countries (OAPEC, consisting of '
    'the Arab members of OPEC plus Egypt and Syria) proclaimed an oil embargo. '
    'By the end of the embargo in March 1974, the price of oil had risen from '
    'US$3 per barrel to nearly $12 globally; US prices were significantly '
    'higher. The embargo caused an oil crisis, or "shock", with many short- '
    'and long-term effects on global politics and the global economy. It was '
    'later called the "first oil shock", followed by the 1979 oil crisis, '
    'termed the "second oil shock". So, who proclaimed the oil embargo?'
) * 100
#long_prompt = "What is a one-sentence summary of the following article? You could go directly into the confessional (provided there's no one else in there or waiting outside), but sometimes it's nice to take a minute in the pew by yourself beforehand. You have this beautiful church probably almost all to yourself. Can you feel its energy resonating through you? Can you feel the majesty of the Lord's kingdom and how you're a part of it? Take a moment to kneel and pray with your head down and hands clasped together. Reflect on your faith and how you feel currently. Think about how you've been responding to God's call and how you've been living in the light of his love. When the priest is ready for you, of course. You'll probably see him there by his lonesome or someone else walk out just before you. Sit down either across from him or behind the screen -- it's totally up to you whether or not you prefer to remain anonymous. He won't treat you any differently either way. Make the sign of the cross upon his prompt, saying, \"Bless me, Father, for I have sinned. It has been (blank) since my last confession.\" This is your standard, traditional phrasing. However, if you just sit down and say hello, that's fine, too. The priest knows what he's doing. The Byzantine Rite is a bit different. The priest may sit to your side and put his epitrachelion on your head. He may then also do the Prayer of Absolution. But the idea remains the exact same -- just go wherever he takes you. Once you sit down and you've made the sign of the cross, just sit back and follow the priest's lead. He'll ask you how long it's been since your last confession (if you don't voluntarily offer that information), how you are feeling, maybe how your faith is going, and then ask you what sins you would like to talk about with him and God. It's just a casual conversation! Do not fret. There is absolutely zero pressure on your part. 
Again, as long as you come there with the intention of leaving with a clean heart, you're more than welcome in the church. There is no wrong way to go about confession! This part is intimidating, but think about it this way: the priest you're talking to has probably heard just about everything before. Whatever you have to say will not blow his mind. So when he asks, start rattling them off, from the most serious to the least. If he asks any questions, answer them, but do not feel the need to go into detail. A simple, \"I did so and so,\" will suffice. Your priest is going to be very understanding. If you don't remember the exact timeframe, that's fine. If you don't remember your motivation, that's fine. All your priest cares about is that you're being as honest as possible and that your heart is in the right place. He'll talk you through everything, possibly asking about your intentions, but mainly just letting you know that God loves you, sin and all. If he has any ideas to bring you closer to God, he may suggest them at this juncture. He's there to help, after all. He will then ask you to make an Act of Contrition. That goes like this: My God, I am sorry for my sins with all my heart.In choosing to do wrong and failing to do good,I have sinned against You whom I should loveabove all things. I firmly intend, with your help,to do penance, to sin no more, andto avoid whatever leads me to sin.Our Savior Jesus Christ suffered and died for us.In his name, my God, have mercy (If you are a Roman Catholic, your act of contrition will go like this: Oh my God, I am very sorry for having offended thee. I detest all of my sins because of thy just punishment. But most of all, because they offend you, my God, who is all good and deserving of all my love. I firmly resolve with the help of thy grace, to sin no more, and to avoid the near occasion of sin. Amen. Don't worry! It won't be anything huge. You may even walk away just having to say a few meaningful prayers. 
Take the absolution to heart -- you now have a brand new, clean slate to work with. It'll feel so uplifting! Just to clarify, \"absolution\" means your sins are washed away. \"Penance\" is your expression of regret and repentance, showing God that you're truly sorry for what you've done and that you wish for nothing more than to be forgiven. Summary:" | |
# Optional flags. Defaults are unchanged; only help text was added.
parser.add_argument("-t", "--torch", action="store_true",
                    help="load the PyTorch model instead of the OpenVINO one")
parser.add_argument("-r", "--repeat", type=int, default=1000,
                    help="number of generation rounds per (l0, len) combination")
parser.add_argument("-p", "--prompt", nargs='+', type=str, default=[long_prompt],
                    help="prompt text(s); a single prompt is replicated to fill the batch")
parser.add_argument("-l0", nargs='+', type=int, default=[0],
                    help="prompt length(s) in tokens to pad to; 0 keeps the natural prompt length")
parser.add_argument("-l", "--len", nargs='+', type=int, default=[100],
                    help="number of tokens to generate after the prompt")
parser.add_argument("-b", "--batch", type=int, default=1,
                    help="batch size; must match the number of prompts when above 1")
parser.add_argument("--export", action="store_true",
                    help="export the model to OpenVINO format, save it and exit")
parser.add_argument("--bf16", action="store_true",
                    help="use bf16 instead of f32 precision")
parser.add_argument("-v", "--verbose", action="store_true",
                    help="print generated text in full instead of truncating it")
parser.add_argument("--top_p", type=float, default=0,
                    help="top-p sampling threshold; 0 disables it")
parser.add_argument("--top_k", type=int, default=0,
                    help="top-k sampling size; 0 disables it")
# default penalty_alpha==0 means degenerate to top-k
parser.add_argument("--penalty_alpha", type=float, default=0,
                    help="penalty_alpha passed to generate(); 0 disables it")
parser.add_argument("-nb", '--num_beams', type=int, default=1,
                    help="number of beams; only passed to generate() when above 1")
args = parser.parse_args()
# Translate the sampling-related CLI flags into keyword arguments for
# model.generate(). With no flag set, sampling is explicitly disabled.
generate_kwargs = {"do_sample": False}

if args.top_k > 0:
    generate_kwargs.update(top_k=args.top_k, do_sample=True)
    # NOTE(review): the source this was recovered from had lost its
    # indentation; the penalty_alpha branch is assumed to be nested under
    # top_k (its flag comment says alpha == 0 degenerates to top-k) -- confirm.
    if args.penalty_alpha > 0:
        generate_kwargs["penalty_alpha"] = args.penalty_alpha
        # Drop the explicit do_sample so generate() falls back to its default.
        del generate_kwargs["do_sample"]

if args.top_p > 0:
    # top-p overrides any top-k setting made above.
    generate_kwargs.update(top_p=args.top_p, top_k=0, do_sample=True)
# For batched runs, replicate a single prompt across the batch and make sure
# the number of prompts matches the requested batch size.
if args.batch > 1:
    if len(args.prompt) == 1:
        args.prompt *= args.batch
    if len(args.prompt) != args.batch:
        # Raising a plain string is itself a TypeError in Python 3, which hid
        # the real message; raise a proper exception instead.
        raise ValueError("prompt & batch inconsistent!")
model_id = args.model_id

# Proxy settings taken from the environment. Built before the export branch
# because that branch uses them; variables that are unset or empty are skipped
# instead of raising KeyError.
proxies = {
    scheme: os.environ[f"{scheme}_proxy"]
    for scheme in ("http", "https")
    if os.environ.get(f"{scheme}_proxy")
}

# --export: download the model, convert it to OpenVINO format, save it next to
# its tokenizer, then exit without running the benchmark.
if args.export:
    if model_id.endswith('/'):
        model_id = model_id[:-1]
    # NOTE(review): the output directory is hard-coded to one user's home.
    ov_pretrained_model_path = f"/home/tingqian/models/ov-{model_id.split('/')[-1]}"
    print(f"load pretrained & export & save to {ov_pretrained_model_path}")
    # Pass None rather than an empty dict when no proxy is configured.
    tokenizer = AutoTokenizer.from_pretrained(model_id, proxies=proxies or None)
    model = OVModelForCausalLM.from_pretrained(model_id, export=True, proxies=proxies or None)
    model.save_pretrained(ov_pretrained_model_path)
    tokenizer.save_pretrained(ov_pretrained_model_path)
    # os.exit() does not exist; sys.exit() is the intended call.
    sys.exit()
print(f"/************** {model_id} ******************/")

# OpenVINO runtime properties handed to the OV model on load.
ov_config={"PERFORMANCE_HINT": "LATENCY",
           "INFERENCE_PRECISION_HINT" : "bf16" if args.bf16 else "f32",
           "CPU_DENORMALS_OPTIMIZATION" : "YES",
           "CACHE_DIR" : None}
torch_dtype = torch.bfloat16 if args.bf16 else torch.float32

if args.torch:
    # PyTorch path: load the model, print its symbolically traced graph and
    # exit. The benchmark loop is never reached on this path.
    print(f"load pretrained pytorch model from {model_id}")
    print(f"use precision {torch_dtype}")
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(model_id)
    model = model.to(dtype=torch_dtype)
    from torch.fx import symbolic_trace
    symbolic_traced : torch.fx.GraphModule = symbolic_trace(model)
    print(symbolic_traced.graph)
    sys.exit()
else:
    # OpenVINO path: load the instrumented model and allocate its shared
    # beam-index buffer.
    print(f"load pretrained ov model from {model_id}")
    print(f"ov_config={ov_config}")
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = OVModelForCausalLM_opt.from_pretrained(model_id, ov_config=ov_config)
    model.setup()

# https://stackoverflow.com/questions/70544129/transformers-asking-to-pad-but-the-tokenizer-does-not-have-a-padding-token
if tokenizer.pad_token is None:
    if tokenizer.eos_token is not None:
        # Reuse EOS for padding: a brand-new '[PAD]' token gets an id outside
        # the model's existing vocabulary, and generation below already uses
        # eos_token_id as pad_token_id.
        tokenizer.pad_token = tokenizer.eos_token
    else:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# Benchmark loop: for every (prompt length, answer length) combination run
# `args.repeat` generation rounds, print the generated text whenever it differs
# from the previous round, and report timing for each round.
ref_result = ''

# These generate() arguments do not depend on the round, so set them once.
generate_kwargs["temperature"] = 0.9
generate_kwargs["pad_token_id"] = tokenizer.eos_token_id
if args.num_beams > 1:
    generate_kwargs["num_beams"] = args.num_beams

for question_len, answer_len in itertools.product(args.l0, args.len):
    for i in range(args.repeat):
        if question_len > 0:
            # Pad every prompt to exactly question_len tokens. padding="max_length"
            # is the supported spelling of the deprecated pad_to_max_length=True.
            inputs = tokenizer(args.prompt, return_tensors="pt", max_length=question_len, padding="max_length", return_token_type_ids=False)
        else:
            # Natural length: pad only up to the longest prompt in the batch.
            inputs = tokenizer(args.prompt, return_tensors="pt", padding=True, return_token_type_ids=False)

        actual_question_len = inputs.input_ids.size(1)
        max_total_len = actual_question_len + answer_len

        # min_length == max_length forces exactly answer_len new tokens.
        generate_kwargs["min_length"] = max_total_len
        generate_kwargs["max_length"] = max_total_len

        t0 = time.time()
        model.clear_beam_idx()
        model.begin_latency_record()
        gen_tokens = model.generate(**inputs, **generate_kwargs)
        t1 = time.time()
        # One-second pause between rounds; it is outside the timed region.
        time.sleep(1)

        result = tokenizer.batch_decode(gen_tokens)
        result_str = ';;;'.join(result).encode('utf-8')

        # Print the text only when it changed since the previous round.
        if ref_result != result_str:
            ref_result = result_str
            md5sum = hashlib.md5(result_str).hexdigest()
            print(f"\n*** Text generated: (with md5sum {md5sum}) ***")
            for rid, r in enumerate(result):
                if args.verbose:
                    print(f"[{rid}] : {r}")
                else:
                    if len(r) > 160:
                        r = r[:160] + "..."
                    # Wrapping in a list prints the repr, keeping newlines escaped.
                    print(f"[{rid}] : {[r]}")

        print(f"round {i}: {actual_question_len}+{answer_len}={len(gen_tokens[0])} tokens {t1-t0:.2f} sec. Per-token latency: {model.latency_summary()} ")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment