# Install dependencies. Bumblebee/Nx/EXLA are pinned to git revisions for
# reproducibility; `override: true` lets the git versions win over the
# transitive hex requirements.
Mix.install([
  {:bumblebee, github: "elixir-nx/bumblebee", ref: "23de64b1b88ed3aad266025c207f255312b80ba6"},
  {:nx, github: "elixir-nx/nx", sparse: "nx", override: true},
  {:exla, github: "elixir-nx/nx", sparse: "exla", override: true},
  {:axon, "~> 0.5.1"},
  {:plug, "~> 1.14.2"},
  {:kino, "~> 0.9"},
  {:benchee, "~> 1.1"},
  {:req, "~> 0.3.9"},
  {:vega_lite, "~> 0.1.7"},
  {:kino_vega_lite, "~> 0.1.9"}
])

# Store tensors in EXLA buffers and compile defn code with EXLA on the CPU
# (`client: :host`) by default.
Nx.global_default_backend(EXLA.Backend)
Nx.Defn.global_default_options(compiler: EXLA, client: :host)

alias VegaLite, as: Vl
# Benchee's parallel option is confusing and does not work for our use case,
# as it waits for all concurrent processes to finish before starting the next one.
# Let's define a module ourselves:
defmodule ConcurrentBench do
  @moduledoc """
  Minimal throughput benchmark: runs a function from a fixed number of
  concurrent processes for a fixed amount of time and reports how often it
  completed.

  Benchee's `:parallel` option does not fit this use case, as it waits for
  all concurrent processes to finish before starting the next batch.
  """

  @doc """
  Repeatedly invokes `fun` from `concurrency` concurrent tasks for `timeout`
  milliseconds.

  Returns `%{runs: total_invocations, ips: invocations_per_second}`.
  """
  @spec run((-> any()), pos_integer(), pos_integer()) ::
          %{runs: non_neg_integer(), ips: float()}
  def run(fun, concurrency \\ System.schedulers_online(), timeout \\ 10_000) do
    # Use an Erlang counter (lock-free, shared across processes) to count the
    # number of function invocations.
    counter = :counters.new(1, [:write_concurrency])

    # :timer.tc/1 returns the elapsed time in microseconds.
    {taken, _} =
      :timer.tc(fn ->
        tasks =
          for _i <- 1..concurrency do
            Task.async(fn ->
              Stream.repeatedly(fn ->
                fun.()
                # Only count after the function ran successfully.
                :counters.add(counter, 1, 1)
              end)
              |> Stream.run()
            end)
          end

        # The tasks loop forever, so yield_many/2 returns only after `timeout`
        # has elapsed; every task that is still running is then killed.
        # Enum.each/2 (not map) — we iterate purely for the side effect.
        results = Task.yield_many(tasks, timeout)

        Enum.each(results, fn {task, res} ->
          res || Task.shutdown(task, :brutal_kill)
        end)
      end)

    runs = :counters.get(counter, 1)
    ips = runs / (taken / 1_000_000)
    %{runs: runs, ips: ips}
  end
end
# a helper module that only runs a function when a button is pressed
defmodule RunOnButtonPress do
  # Renders a run button and a "Stop execution" button and executes `fun` in a
  # fresh process every time the run button is pressed. At most one execution
  # per `id` is alive at a time: pressing run again (or stop) kills the
  # previous one.
  #
  # NOTE(review): the listener pid is stashed in the *caller's* process
  # dictionary under `{:run_on_button_press, id}` (so re-evaluating the same
  # cell replaces the old listener), while the worker pid lives in the
  # listener's own process dictionary under `{:run_on_button_press_pid, id}`.
  def run(id, label, fun) do
    button =
      Kino.Control.button(label)
      |> Kino.render()

    kill =
      Kino.Control.button("Stop execution")
      |> Kino.render()

    # Kill a listener left over from a previous evaluation of this cell.
    if pid = Process.get({:run_on_button_press, id}) do
      Process.exit(pid, :kill)
    end

    pid =
      spawn(fn ->
        # Subscribed events arrive as `{:run, _}` / `{:kill, _}` messages.
        Kino.Control.subscribe(button, :run)
        Kino.Control.subscribe(kill, :kill)

        Stream.repeatedly(fn ->
          receive do
            {:run, _} ->
              # Stop a still-running invocation before starting a new one.
              if pid = Process.get({:run_on_button_press_pid, id}) do
                Process.exit(pid, :kill)
              end

              pid = spawn(fun)
              Process.put({:run_on_button_press_pid, id}, pid)

            {:kill, _} ->
              if pid = Process.get({:run_on_button_press_pid, id}) do
                Process.exit(pid, :kill)
              end
          end
        end)
        |> Stream.run()
      end)

    Process.put({:run_on_button_press, id}, pid)
  end
end
{:module, RunOnButtonPress, <<70, 79, 82, 49, 0, 0, 12, ...>>, {:run, 3}}
This notebook evaluates the performance of encoding sentences using a sentence-transformers model, namely all-MiniLM-L6-v2. We use Bumblebee to get the model and tokenizer:
model_name = "sentence-transformers/all-MiniLM-L6-v2"
{:ok, model_info} = Bumblebee.load_model({:hf, model_name})
{:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, model_name})
15:19:50.679 [info] TfrtCpuClient created.
{:ok,
%Bumblebee.Text.BertTokenizer{
tokenizer: #Tokenizers.Tokenizer<[
vocab_size: 30522,
continuing_subword_prefix: "##",
max_input_chars_per_word: 100,
model_type: "bpe",
unk_token: "[UNK]"
]>,
special_tokens: %{cls: "[CLS]", mask: "[MASK]", pad: "[PAD]", sep: "[SEP]", unk: "[UNK]"}
}}
Let's define some text that is long enough to fully utilize the model's sequence length of 128:
# A paragraph long enough to fully use the model's sequence length of 128 tokens.
text =
  "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr"
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr"
We check that the attention mask is all ones:
# Check that the attention mask sums to 128, i.e. every position holds a real
# token and none is padding.
Bumblebee.apply_tokenizer(tokenizer, [text])["attention_mask"] |> Nx.sum()
#Nx.Tensor<
u64
EXLA.Backend<host:0, 0.2469518244.3382837268.72605>
128
>
Get a list of texts with increasing sequence length for some benchmarks later:
splitted = String.split(text, " ", trim: true)

# All word-prefixes of `text` ("Lorem", "Lorem ipsum", ...), used later to
# benchmark encoding across increasing sequence lengths.
# Enum.scan/2 extends each prefix from the previous one, replacing the
# quadratic `Enum.take(splitted, i) |> Enum.join(" ")` per index (it also
# returns [] for an empty word list instead of iterating the 1..0 range).
texts = Enum.scan(splitted, fn word, prefix -> prefix <> " " <> word end)
["Lorem", "Lorem ipsum", "Lorem ipsum dolor", "Lorem ipsum dolor sit",
"Lorem ipsum dolor sit amet,", "Lorem ipsum dolor sit amet, consetetur",
"Lorem ipsum dolor sit amet, consetetur sadipscing",
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr,",
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed",
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam",
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy",
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod",
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor",
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt",
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut",
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore",
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et",
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore",
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna",
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam",
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat,",
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed",
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam",
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua.",
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At",
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero",
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos",
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et",
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam",
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et",
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo",
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo",
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores",
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et",
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea",
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum.",
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet",
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita",
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd",
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren,",
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no",
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea",
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata",
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus",
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est",
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem",
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum",
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor",
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit",
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.",
...]
# Tokenize each prefix and record its real token count (sum of the attention
# mask) so throughput can be related to sequence length later.
sequence_lengths =
  Enum.map(texts, fn text ->
    input = Bumblebee.apply_tokenizer(tokenizer, [text])
    input["attention_mask"] |> Nx.sum() |> Nx.to_number()
  end)
[4, 6, 8, 9, 12, 16, 20, 23, 25, 27, 30, 34, 36, 39, 40, 42, 43, 45, 46, 49, 52, 54, 56, 60, 61, 63,
65, 66, 69, 70, 72, 73, 74, 75, 76, 80, 82, 84, 86, 90, 91, 92, 95, 97, 98, 100, 102, 104, 105,
108, ...]
Now we benchmark the tokenizer to see if it might have an impact on the overall encode performance.
RunOnButtonPress.run(:tokenizer, "Run benchmark", fn ->
  # Wrap the benchmark in a uniquely named throwaway module that is purged
  # after the run — presumably so repeated button presses don't trip module
  # redefinition; TODO confirm.
  mod = Module.concat(Bench, "Test#{System.unique_integer()}")

  defmodule mod do
    def run(tokenizer, text) do
      Benchee.run(
        %{
          "tokenize" => fn ->
            Bumblebee.apply_tokenizer(tokenizer, [text])
          end
        },
        time: 10,
        memory_time: 2
      )
    end
  end

  mod.run(tokenizer, text)
  |> tap(fn _ ->
    :code.purge(mod)
    :code.delete(mod)
  end)
end)
nil
Operating System: macOS
CPU Information: Apple M1 Pro
Number of Available Cores: 10
Available memory: 32 GB
Elixir 1.14.5
Erlang 25.3.2.1
Benchmark suite executing with the following configuration:
warmup: 2 s
time: 10 s
memory time: 2 s
reduction time: 0 ns
parallel: 1
inputs: none specified
Estimated total run time: 14 s
Benchmarking tokenize ...
Name ips average deviation median 99th %
tokenize 3.79 K 263.88 μs ±23.18% 248.13 μs 447.56 μs
Memory usage statistics:
Name Memory usage
tokenize 21.34 KB
**All measurements for memory usage were the same**
The tokenizer is very fast with > 3k ips, so we ignore its impact below.
Now we test the raw embedding performance using `Axon.predict`.
# Tokenize the benchmark text once; reused by the raw-predict benchmarks below.
inputs = Bumblebee.apply_tokenizer(tokenizer, [text])
%{
"attention_mask" => #Nx.Tensor<
u32[1][128]
EXLA.Backend<host:0, 0.2469518244.3383099411.122520>
[
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...]
]
>,
"input_ids" => #Nx.Tensor<
u32[1][128]
EXLA.Backend<host:0, 0.2469518244.3383099411.122519>
[
[101, 19544, 2213, 12997, 17421, 2079, 10626, 4133, 2572, 3388, 1010, 9530, 13462, 3388, 3126, 6517, 11514, 11020, 2075, 12005, 16344, 1010, 7367, 2094, 22939, 2213, 2512, 2819, 2100, 1041, 4313, 5302, 2094, 13657, 2099, 1999, 17258, 16671, 21183, 4450, 2063, 3802, 2079, 20186, 20201, 4862, 28940, 14852, ...]
]
>,
"token_type_ids" => #Nx.Tensor<
u32[1][128]
EXLA.Backend<host:0, 0.2469518244.3383099411.122521>
[
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]
]
>
}
# Single forward pass through the model, compiled with EXLA.
embedding = Axon.predict(model_info.model, model_info.params, inputs, compiler: EXLA)
%{
attentions: #Axon.None<...>,
cache: #Axon.None<...>,
cross_attentions: #Axon.None<...>,
hidden_state: #Nx.Tensor<
f32[1][128][384]
EXLA.Backend<host:0, 0.2469518244.3383099411.122522>
[
[
[0.2439754456281662, -0.04258212447166443, 0.44050031900405884, -0.02710575982928276, -0.11495500802993774, 0.02201808989048004, -0.07110054790973663, -0.09002304822206497, 0.14736135303974152, -0.05301731452345848, 0.10864540189504623, 0.08479207754135132, 0.1325072944164276, -0.6286611557006836, 0.005330087151378393, -0.06936373561620712, 0.041035961359739304, 0.2797529697418213, 0.2678096294403076, 0.22232460975646973, -0.33385011553764343, 0.1591491848230362, -0.18547852337360382, -0.026528071612119675, -0.42582497000694275, 0.04235398396849632, -0.13319902122020721, 0.07178732007741928, 0.20993556082248688, -1.0995397567749023, -0.33737459778785706, -0.04268856346607208, 0.22351738810539246, 0.01151223387569189, -0.15966778993606567, 0.09978454560041428, 0.21272403001785278, -0.1424511969089508, -0.2455434799194336, -0.23965565860271454, 0.03146129846572876, -0.29717034101486206, 0.0779552012681961, 0.10010133683681488, 0.1790347397327423, 0.06603735685348511, ...],
...
]
]
>,
hidden_states: #Axon.None<...>,
pooled_state: #Nx.Tensor<
f32[1][384]
EXLA.Backend<host:0, 0.2469518244.3383099411.122523>
[
[0.06754760444164276, 6.4690230647102e-4, 0.003184225410223007, 0.0166630782186985, -0.10242754966020584, -0.003044932149350643, 0.028929710388183594, 0.04025515913963318, -0.07270049303770065, 0.028964964672923088, -0.01476008165627718, 0.016810687258839607, -0.02237570285797119, -0.01408410258591175, -0.06558677554130554, 0.0427701361477375, 0.16805700957775116, -0.06141290441155434, -0.02551720291376114, 0.07080098986625671, -0.010854755528271198, -0.009101212956011295, 0.025624696165323257, -0.0356527604162693, -0.009555971249938011, 0.06418594717979431, -0.04090864211320877, -0.05694327503442764, 0.04082518815994263, 0.10054861009120941, 0.044315703213214874, 0.05997159704566002, -0.09621551632881165, -0.023480303585529327, 0.014569595456123352, -0.03468870744109154, -0.04913468286395073, 0.07613613456487656, -0.07787314057350159, -0.046912647783756256, 0.08328253775835037, -0.026116790249943733, -0.09211680293083191, 0.1202111542224884, ...]
]
>
}
Raw predict without batching:
RunOnButtonPress.run(:axon_predict, "Run benchmark", fn ->
  # Uniquely named throwaway module for Benchee, purged after the run.
  mod = Module.concat(Bench, "Test#{System.unique_integer()}")

  defmodule mod do
    def run(model_info, inputs) do
      Benchee.run(
        %{
          "predict" => fn ->
            Axon.predict(model_info.model, model_info.params, inputs, compiler: EXLA)
          end
        },
        time: 10,
        memory_time: 2
      )
    end
  end

  mod.run(model_info, inputs)
  |> tap(fn _ ->
    :code.purge(mod)
    :code.delete(mod)
  end)
end)
nil
Operating System: macOS
CPU Information: Apple M1 Pro
Number of Available Cores: 10
Available memory: 32 GB
Elixir 1.14.5
Erlang 25.3.2.1
Benchmark suite executing with the following configuration:
warmup: 2 s
time: 10 s
memory time: 2 s
reduction time: 0 ns
parallel: 1
inputs: none specified
Estimated total run time: 14 s
Benchmarking predict ...
Name ips average deviation median 99th %
predict 45.87 21.80 ms ±6.46% 21.54 ms 28.12 ms
Memory usage statistics:
Name Memory usage
predict 632.12 KB
**All measurements for memory usage were the same**
Operating System: macOS
CPU Information: Apple M1 Pro
Number of Available Cores: 10
Available memory: 32 GB
Elixir 1.14.5
Erlang 25.3.2.1
Benchmark suite executing with the following configuration:
warmup: 2 s
time: 10 s
memory time: 2 s
reduction time: 0 ns
parallel: 1
inputs: none specified
Estimated total run time: 14 s
Benchmarking predict ...
Name ips average deviation median 99th %
predict 45.08 22.18 ms ±6.10% 21.92 ms 27.66 ms
Memory usage statistics:
Name Memory usage
predict 632.12 KB
**All measurements for memory usage were the same**
This achieves ~45 encodes per second on my MacBook.
Let's define a simple serving that executes the predict function to test batching:
defmodule Serving do
  import Nx.Defn

  # Mean-pool the token embeddings of each sequence, ignoring padded
  # positions via the attention mask.
  defn mean_pooling(model_output, attention_mask) do
    input_mask_expanded = Nx.new_axis(attention_mask, -1)

    model_output
    |> Nx.multiply(input_mask_expanded)
    |> Nx.sum(axes: [1])
    |> Nx.divide(Nx.sum(input_mask_expanded, axes: [1]))
  end

  # Builds an Nx.Serving that runs the model via Axon.predict (compiled with
  # EXLA), tokenizing on the client and mean-pooling the hidden state on the
  # way out.
  def serving(model_info, tokenizer) do
    mean_pool = Nx.Defn.jit(&mean_pooling/2, compiler: EXLA)

    Nx.Serving.new(fn _compiler_options ->
      fn inputs ->
        Axon.predict(model_info.model, model_info.params, inputs, compiler: EXLA)
      end
    end)
    |> Nx.Serving.client_preprocessing(fn input when is_list(input) ->
      inputs = Bumblebee.apply_tokenizer(tokenizer, input)

      # Tokenize again, capping `:length` at the longest sequence in this
      # request so we don't always pad to the model's full 128 tokens.
      # Fix: the previous Nx.sum/1 summed the mask over *all* inputs, which
      # overestimated the length whenever more than one text was passed; we
      # now take the maximum of the per-row sums (identical for one text).
      seq_length =
        inputs["attention_mask"]
        |> Nx.sum(axes: [1])
        |> Nx.reduce_max()
        |> Nx.to_number()

      inputs = Bumblebee.apply_tokenizer(tokenizer, input, length: seq_length)
      {Nx.Batch.concatenate([inputs]), inputs}
    end)
    |> Nx.Serving.client_postprocessing(fn embeddings, _server_info, _client_info = inputs ->
      mean_pool.(embeddings.hidden_state, inputs["attention_mask"])
    end)
  end
end
{:module, Serving, <<70, 79, 82, 49, 0, 0, 18, ...>>, {:serving, 2}}
# Build the serving struct (no serving process is started yet).
serving = Serving.serving(model_info, tokenizer)
%Nx.Serving{
module: Nx.Serving.Default,
arg: #Function<2.126123788/1 in Serving.serving/2>,
client_preprocessing: #Function<3.126123788/1 in Serving.serving/2>,
client_postprocessing: #Function<4.126123788/3 in Serving.serving/2>,
distributed_postprocessing: &Function.identity/1,
process_options: [],
defn_options: [compiler: EXLA, client: :host]
}
# Smoke test: run the serving inline, without a serving process.
Nx.Serving.run(serving, [text])
#Nx.Tensor<
f32[1][384]
EXLA.Backend<host:0, 0.2469518244.3383099411.125071>
[
[0.07581919431686401, 0.09831660985946655, 0.13872618973255157, -0.05035487189888954, -0.13231438398361206, 0.024065986275672913, 0.09062023460865021, 0.0558452382683754, 0.08619263023138046, 0.018240230157971382, 0.10397972911596298, -0.0745587944984436, 0.052109457552433014, -0.1966744065284729, -0.059412822127342224, -0.03932008147239685, 0.0159605722874403, 0.1776045709848404, 0.02290552295744419, -1.755962148308754e-4, 0.014708181843161583, 0.043549295514822006, -0.027428843080997467, 0.07471135258674622, -0.13056571781635284, 0.011731253005564213, 0.033480431884527206, 0.013122644275426865, 0.0908602923154831, -0.09910072386264801, -0.01821049675345421, 0.08142084628343582, 0.12315520644187927, 0.052727632224559784, -0.005705366376787424, 0.07481136173009872, 0.0704440101981163, -0.15515460073947906, -0.04733641445636749, -0.03984561562538147, -0.009945175610482693, -0.0984136313199997, -0.0319361612200737, -0.049165405333042145, 0.019830983132123947, 0.0258416049182415, -0.023530248552560806, 0.06172958016395569, 0.0617603063583374, 0.010934718884527683, ...]
]
>
The performance using a single process should be around 1000/50 = 20 encodes per second, based on the configured batch timeout of 50 ms:
RunOnButtonPress.run(:nx_serving, "Run benchmark", fn ->
  # Start the serving process under Kino's supervisor: requests are batched
  # up to 32 at a time, waiting at most 50 ms for a batch to fill.
  {:ok, pid} =
    Kino.start_child(
      {Nx.Serving, serving: serving, name: MyServing, batch_size: 32, batch_timeout: 50}
    )

  # Uniquely named throwaway module for Benchee, purged after the run.
  mod = Module.concat(Bench, "Test#{System.unique_integer()}")

  defmodule mod do
    def run(text) do
      Benchee.run(
        %{
          "batched_predict" => fn ->
            Nx.Serving.batched_run(MyServing, [text])
          end
        },
        time: 10,
        memory_time: 2
      )
    end
  end

  mod.run(text)
  |> tap(fn _ ->
    :code.purge(mod)
    :code.delete(mod)
  end)
  |> tap(fn _ ->
    # Stop the serving so the MyServing name is free for the next run.
    Kino.terminate_child(pid)
  end)
end)
nil
Operating System: macOS
CPU Information: Apple M1 Pro
Number of Available Cores: 10
Available memory: 32 GB
Elixir 1.14.5
Erlang 25.3.2.1
Benchmark suite executing with the following configuration:
warmup: 2 s
time: 10 s
memory time: 2 s
reduction time: 0 ns
parallel: 1
inputs: none specified
Estimated total run time: 14 s
Benchmarking batched_predict ...
Name ips average deviation median 99th %
batched_predict 12.82 78.03 ms ±2.20% 77.73 ms 83.37 ms
Memory usage statistics:
Name Memory usage
batched_predict 63.67 KB
**All measurements for memory usage were the same**
On my MacBook this achieves ~13 encodes per second.
We'll use the ConcurrentBench module defined in the setup step to run concurrent benchmarks. Let's also define a helper module to start the serving using the correct configuration and run a benchmark using a specific concurrency.
defmodule BenchTest do
  # Boots an Nx.Serving process under Kino with the given batching options,
  # hammers it with `concurrency` concurrent callers for `timeout` ms via
  # ConcurrentBench, prints the result and returns it (%{runs: _, ips: _}).
  def run(serving, text, batch_size, batch_timeout, concurrency, timeout \\ 10_000) do
    {:ok, pid} =
      Kino.start_child(
        {Nx.Serving,
         serving: serving, name: MyServing, batch_size: batch_size, batch_timeout: batch_timeout}
      )

    # Uniquely named throwaway module, purged after the run.
    mod = Module.concat(Bench, "Test#{System.unique_integer()}")

    defmodule mod do
      def run(text, concurrency, timeout) do
        ConcurrentBench.run(
          fn ->
            Nx.Serving.batched_run(MyServing, [text])
          end,
          concurrency,
          timeout
        )
      end
    end

    result =
      mod.run(text, concurrency, timeout)
      |> tap(fn _ ->
        :code.purge(mod)
        :code.delete(mod)
      end)
      |> IO.inspect(
        label: "batch: #{batch_size}; concurrency: #{concurrency}, timeout: #{batch_timeout}"
      )

    # Stop the serving so the MyServing name is free for the next run.
    Kino.terminate_child(pid)
    result
  end
end
{:module, BenchTest, <<70, 79, 82, 49, 0, 0, 23, ...>>, {:run, 6}}
RunOnButtonPress.run(:concurrent_serving, "Run benchmark", fn ->
  # The best combination of parameters found via the parameter sweep below.
  batch_size = 64
  batch_timeout = 50
  concurrency = 64
  serving = Serving.serving(model_info, tokenizer)
  BenchTest.run(serving, text, batch_size, batch_timeout, concurrency)
end)
nil
batch: 64; concurrency: 64, timeout: 50: %{ips: 108.76860937933313, runs: 1088}
On my MacBook this achieves ~110 encodes per second.
I used the following code to test different batch sizes and timeouts:
# serving = Serving.serving(model_info, tokenizer)
# for batch_size <- [1,2,4,8,16,32,64,128,256], concurrency <- [8,16,32,64,128,256,512,1024], batch_timeout <- [1, 5, 10, 20, 50, 100, 200] do
# BenchTest.run(serving, text, batch_size, batch_timeout, concurrency)
# end
nil
Let's plot the invocations per second by the sequence length of the input:
# Live-updating line chart: invocations per second vs. input sequence length.
chart =
  Vl.new(width: 1280, height: 720)
  |> Vl.mark(:line)
  |> Vl.encode_field(:x, "sequence_length", type: :quantitative)
  |> Vl.encode_field(:y, "ips", type: :quantitative)
  |> Kino.VegaLite.new()

RunOnButtonPress.run(:sequence_length_graph, "Generate Graph", fn ->
  batch_size = 64
  batch_timeout = 50
  concurrency = 64
  serving = Serving.serving(model_info, tokenizer)

  # Benchmark every prefix text (5 s each) and stream the data points into
  # the chart as they are produced.
  for {text, sequence_length} <- Enum.zip(texts, sequence_lengths) do
    %{ips: ips} = BenchTest.run(serving, text, batch_size, batch_timeout, concurrency, 5_000)
    Kino.VegaLite.push(chart, %{ips: ips, sequence_length: sequence_length})
  end
end)
nil
batch: 64; concurrency: 64, timeout: 50: %{ips: 1266.7290301465916, runs: 6336}
batch: 64; concurrency: 64, timeout: 50: %{ips: 1023.3061983974866, runs: 5120}
batch: 64; concurrency: 64, timeout: 50: %{ips: 946.6301286625452, runs: 4736}
batch: 64; concurrency: 64, timeout: 50: %{ips: 882.4731950765349, runs: 4416}
batch: 64; concurrency: 64, timeout: 50: %{ips: 780.1914506684786, runs: 3904}
batch: 64; concurrency: 64, timeout: 50: %{ips: 652.4626767960964, runs: 3264}
batch: 64; concurrency: 64, timeout: 50: %{ips: 562.8532823780552, runs: 2816}
batch: 64; concurrency: 64, timeout: 50: %{ips: 524.5742232543114, runs: 2624}
batch: 64; concurrency: 64, timeout: 50: %{ips: 511.5028192596796, runs: 2560}
batch: 64; concurrency: 64, timeout: 50: %{ips: 473.0722406083773, runs: 2368}
batch: 64; concurrency: 64, timeout: 50: %{ips: 447.6020817493249, runs: 2240}
batch: 64; concurrency: 64, timeout: 50: %{ips: 396.516490709143, runs: 1984}
batch: 64; concurrency: 64, timeout: 50: %{ips: 383.57975002587165, runs: 1920}
batch: 64; concurrency: 64, timeout: 50: %{ips: 358.06492284520147, runs: 1792}
batch: 64; concurrency: 64, timeout: 50: %{ips: 358.07815935037587, runs: 1792}
batch: 64; concurrency: 64, timeout: 50: %{ips: 345.2631612598749, runs: 1728}
batch: 64; concurrency: 64, timeout: 50: %{ips: 345.4482100565012, runs: 1728}
batch: 64; concurrency: 64, timeout: 50: %{ips: 332.3640048983743, runs: 1664}
batch: 64; concurrency: 64, timeout: 50: %{ips: 319.78721358807854, runs: 1600}
batch: 64; concurrency: 64, timeout: 50: %{ips: 307.02254097131856, runs: 1536}
batch: 64; concurrency: 64, timeout: 50: %{ips: 294.076692084722, runs: 1472}
batch: 64; concurrency: 64, timeout: 50: %{ips: 281.34335858829576, runs: 1408}
batch: 64; concurrency: 64, timeout: 50: %{ips: 268.4104291031996, runs: 1344}
batch: 64; concurrency: 64, timeout: 50: %{ips: 255.91130114302385, runs: 1280}
batch: 64; concurrency: 64, timeout: 50: %{ips: 242.92961933368161, runs: 1216}
batch: 64; concurrency: 64, timeout: 50: %{ips: 230.238372662391, runs: 1152}
batch: 64; concurrency: 64, timeout: 50: %{ips: 242.98850280715666, runs: 1216}
batch: 64; concurrency: 64, timeout: 50: %{ips: 230.18928472875928, runs: 1152}
batch: 64; concurrency: 64, timeout: 50: %{ips: 204.64201636336747, runs: 1024}
batch: 64; concurrency: 64, timeout: 50: %{ips: 217.2259369365952, runs: 1088}
batch: 64; concurrency: 64, timeout: 50: %{ips: 217.3457489428866, runs: 1088}
batch: 64; concurrency: 64, timeout: 50: %{ips: 217.44769963117832, runs: 1088}
batch: 64; concurrency: 64, timeout: 50: %{ips: 217.46482386548522, runs: 1088}
batch: 64; concurrency: 64, timeout: 50: %{ips: 204.58739278142156, runs: 1024}
batch: 64; concurrency: 64, timeout: 50: %{ips: 204.68615356138918, runs: 1024}
batch: 64; concurrency: 64, timeout: 50: %{ips: 204.65894905231315, runs: 1024}
batch: 64; concurrency: 64, timeout: 50: %{ips: 191.83103522417454, runs: 960}
batch: 64; concurrency: 64, timeout: 50: %{ips: 191.82804534015708, runs: 960}
batch: 64; concurrency: 64, timeout: 50: %{ips: 178.95493910638774, runs: 896}
batch: 64; concurrency: 64, timeout: 50: %{ips: 179.12906489030345, runs: 896}
batch: 64; concurrency: 64, timeout: 50: %{ips: 166.2415385654394, runs: 832}
batch: 64; concurrency: 64, timeout: 50: %{ips: 167.93853449637433, runs: 840}
batch: 64; concurrency: 64, timeout: 50: %{ips: 166.28596108788594, runs: 832}
batch: 64; concurrency: 64, timeout: 50: %{ips: 166.18077432251374, runs: 832}
batch: 64; concurrency: 64, timeout: 50: %{ips: 153.10977312839748, runs: 768}
batch: 64; concurrency: 64, timeout: 50: %{ips: 153.32034369310378, runs: 768}
batch: 64; concurrency: 64, timeout: 50: %{ips: 140.33753169804226, runs: 704}
batch: 64; concurrency: 64, timeout: 50: %{ips: 140.65349531927546, runs: 704}
batch: 64; concurrency: 64, timeout: 50: %{ips: 140.51155787399625, runs: 704}
batch: 64; concurrency: 64, timeout: 50: %{ips: 140.7021276000414, runs: 704}
batch: 64; concurrency: 64, timeout: 50: %{ips: 140.7430834970338, runs: 704}
batch: 64; concurrency: 64, timeout: 50: %{ips: 127.84584348192949, runs: 640}
batch: 64; concurrency: 64, timeout: 50: %{ips: 127.72551786211432, runs: 640}
batch: 64; concurrency: 64, timeout: 50: %{ips: 127.88055955737342, runs: 640}
batch: 64; concurrency: 64, timeout: 50: %{ips: 127.67567824212935, runs: 640}
batch: 64; concurrency: 64, timeout: 50: %{ips: 114.82436357699422, runs: 576}
batch: 64; concurrency: 64, timeout: 50: %{ips: 115.02835468913287, runs: 576}
batch: 64; concurrency: 64, timeout: 50: %{ips: 115.01654860497506, runs: 576}
Let's try the official TextEmbedding serving:
# The official Bumblebee serving, configured to match our Serving module:
# mean pooling over the hidden state, compiled for batches of 32 at the
# model's full sequence length of 128.
serving =
  Bumblebee.Text.TextEmbedding.text_embedding(model_info, tokenizer,
    compile: [batch_size: 32, sequence_length: 128],
    defn_options: [compiler: EXLA],
    output_attribute: :hidden_state,
    output_pool: :mean_pooling
  )
%Nx.Serving{
module: Nx.Serving.Default,
arg: #Function<1.45271292/1 in Bumblebee.Text.TextEmbedding.text_embedding/3>,
client_preprocessing: #Function<2.45271292/1 in Bumblebee.Text.TextEmbedding.text_embedding/3>,
client_postprocessing: #Function<3.45271292/3 in Bumblebee.Text.TextEmbedding.text_embedding/3>,
distributed_postprocessing: &Function.identity/1,
process_options: [batch_size: 32],
defn_options: [client: :host, compiler: EXLA]
}
# Embed a sample text with the Bumblebee serving; it returns one result map per input.
[%{embedding: bumblebee_vec}] = Nx.Serving.run(serving, ["test"])
[
%{
embedding: #Nx.Tensor<
f32[384]
EXLA.Backend<host:0, 0.2469518244.3383099408.7029>
[0.07298431545495987, 0.15851451456546783, -0.23144972324371338, 0.3741159439086914, -0.045083049684762955, -0.25977984070777893, 0.48613056540489197, 0.23612074553966522, 0.07850591093301773, -0.038579411804676056, 0.10742209106683731, -0.48567596077919006, -0.0024859558325260878, 0.1760004609823227, -0.10083096474409103, -0.4305591285228729, 0.056028734892606735, -0.12789513170719147, -0.5067673325538635, -0.08244767785072327, -0.25918564200401306, -0.16331887245178223, -0.16735832393169403, 0.20843519270420074, -0.13923603296279907, 0.1327216774225235, -0.3652689754962921, 0.20778203010559082, 0.1873413324356079, -0.3925524055957794, 0.24460601806640625, 0.20174042880535126, 0.09667900204658508, 0.28571608662605286, 0.33517149090766907, 0.08425561338663101, 0.25997352600097656, 0.17747525870800018, 0.12233105301856995, -0.02050975151360035, -0.02277998812496662, -0.9009084701538086, 0.24008531868457794, -0.06883982568979263, 0.1645544469356537, 0.2608881890773773, -0.10099929571151733, 0.33776208758354187, ...]
>
}
]
# Embed the same text with our own serving for comparison.
our_vec = Nx.Serving.run(Serving.serving(model_info, tokenizer), ["test"])
#Nx.Tensor<
f32[1][384]
EXLA.Backend<host:0, 0.2469518244.3383099408.7041>
[
[0.07298391312360764, 0.15851463377475739, -0.23144972324371338, 0.3741157054901123, -0.04508311673998833, -0.2597799301147461, 0.48613032698631287, 0.23612110316753387, 0.07850580662488937, -0.03857932984828949, 0.1074218824505806, -0.4856758117675781, -0.0024859358090907335, 0.17600063979625702, -0.10083112865686417, -0.4305591285228729, 0.05602874234318733, -0.12789462506771088, -0.5067673325538635, -0.08244790136814117, -0.25918570160865784, -0.163318932056427, -0.1673581749200821, 0.20843501389026642, -0.1392362266778946, 0.1327214539051056, -0.36526891589164734, 0.2077821046113968, 0.18734145164489746, -0.3925524950027466, 0.24460570514202118, 0.201740562915802, 0.09667924046516418, 0.2857159376144409, 0.33517202734947205, 0.08425549417734146, 0.25997331738471985, 0.17747533321380615, 0.12233075499534607, -0.020509907975792885, -0.02278018556535244, -0.9009080529212952, 0.2400854378938675, -0.06883969157934189, 0.1645541936159134, 0.26088789105415344, -0.10099910944700241, 0.3377624452114105, -0.3585679233074188, 0.07723064720630646, ...]
]
>
# Summed elementwise difference between the two embeddings — on the order of
# 1e-7, i.e. both servings agree up to floating-point noise.
Nx.subtract(bumblebee_vec, our_vec)
|> Nx.sum()
#Nx.Tensor<
f32
EXLA.Backend<host:0, 0.2469518244.3383099408.7043>
-3.0419323593378067e-7
>
Now benchmark the Bumblebee serving:
# Benchmark the Bumblebee serving under heavy concurrent load (runs only when
# the "Run benchmark" button is pressed).
RunOnButtonPress.run(:bumblebee_serving, "Run benchmark", fn ->
  # Serving/batching parameters for this run.
  bench_opts = [batch_size: 32, batch_timeout: 50, concurrency: 256]

  BenchTest.run(
    serving,
    text,
    bench_opts[:batch_size],
    bench_opts[:batch_timeout],
    bench_opts[:concurrency]
  )
end)
nil
# Two near-identical sentences for a cosine-similarity sanity check.
text_a = "the cat is fluffy"
text_b = "the dog is fluffy"
"the dog is fluffy"
Encoding the two sentences `text_a` and `text_b` using Python leads to a cosine similarity of 0.7277918. Let's compare:
# Embed both sentences with our serving and with Bumblebee's built-in one.
our_result = Nx.Serving.run(Serving.serving(model_info, tokenizer), [text_a, text_b])
# The Bumblebee serving returns one result map per input; keep only the tensors.
bumblebee_result = Nx.Serving.run(serving, [text_a, text_b]) |> Enum.map(& &1.embedding)
[
#Nx.Tensor<
f32[384]
EXLA.Backend<host:0, 0.2469518244.3383099408.7067>
[0.42453137040138245, 0.04793732985854149, 0.23207329213619232, 0.31005850434303284, -0.44257259368896484, -0.07288091629743576, 0.24788761138916016, -0.15513823926448822, -0.38914236426353455, 0.11603856086730957, -0.2815646231174469, -0.4112638533115387, 0.10961031168699265, 0.45908474922180176, -0.1510944813489914, -0.15521377325057983, -0.32790467143058777, -0.1445486694574356, -0.04097413644194603, 0.2824035882949829, 0.11564195156097412, 0.1867489218711853, 0.03591810539364815, -0.0682014599442482, -0.2890753746032715, 0.014984901063144207, -0.26554152369499207, -0.4974195957183838, -0.0337618924677372, -0.1758141964673996, -0.2900819182395935, -0.004172794055193663, -0.18819290399551392, 0.3920232057571411, -0.13242138922214508, -0.4983557164669037, 0.08606203645467758, -0.20720718801021576, 0.25369980931282043, 0.4534916877746582, -0.3977705240249634, -0.10881897062063217, -0.1350875347852707, -0.07513760030269623, -0.13268141448497772, -0.05489715561270714, 0.4929731786251068, -0.39461562037467957, 0.18987540900707245, ...]
>,
#Nx.Tensor<
f32[384]
EXLA.Backend<host:0, 0.2469518244.3383099408.7069>
[0.006419365759938955, -0.11066964268684387, 0.3840847909450531, 0.5104355216026306, -0.1789720058441162, -0.032623715698719025, 0.22227151691913605, -0.24957036972045898, -0.12879173457622528, 0.22260363399982452, -0.2272268533706665, -0.46199890971183777, 0.4086662232875824, 0.48196661472320557, -0.12417189031839371, -0.04760630801320076, -0.07500839978456497, -0.2633509635925293, -0.007205649744719267, -0.13666996359825134, -0.09921097755432129, 0.19720016419887543, 0.030606938526034355, -0.17318342626094818, -0.3837771415710449, -0.2333124876022339, 0.03461345657706261, -0.5993849635124207, 0.06604061275720596, -0.3581312894821167, -0.1961904615163803, -0.06256885081529617, -0.2888460159301758, 0.25639280676841736, 0.059220243245363235, -0.24551276862621307, 0.2712719738483429, 0.006484154146164656, 0.25952595472335815, 0.25420212745666504, -0.015437237918376923, 0.17392598092556, 0.015872417017817497, -0.28491270542144775, -0.13758261501789093, -0.1078195869922638, -0.07086142152547836, -0.43071842193603516, ...]
>
]
# Cosine similarity between the two embeddings produced by our serving.
Bumblebee.Utils.Nx.cosine_similarity(our_result[0], our_result[1])
#Nx.Tensor<
f32
EXLA.Backend<host:0, 0.2469518244.3383099408.7076>
0.7277917265892029
>
# The same similarity, computed from Bumblebee's embeddings — should agree
# with the value above.
Bumblebee.Utils.Nx.cosine_similarity(
  Enum.at(bumblebee_result, 0),
  Enum.at(bumblebee_result, 1)
)
#Nx.Tensor<
f32
EXLA.Backend<host:0, 0.2469518244.3383099408.7079>
0.7277917265892029
>
👍
This section assumes that the python3
binary is available in your search path.
Let's use Python for calculating the embedding instead and compare our results.
# Scratch directory for the Python comparison server.
# Path.join/2 instead of `<>`: System.tmp_dir!/0 is not guaranteed to end in a
# path separator, so plain concatenation can silently yield a sibling path like
# ".../Tlivebook_embedding_py" on platforms where TMPDIR lacks the trailing "/".
tmp_dir = Path.join(System.tmp_dir!(), "livebook_embedding_py")
"/var/folders/m7/xttdjkqd24dcqqmy5gv1gddm0000gp/T/livebook_embedding_py"
# Create the scratch directory (and any missing parents). File.mkdir_p!/1 is
# idempotent — an already-existing directory is fine — but raises on real
# failures (e.g. permissions) instead of returning an ignored {:error, _}
# tuple the way the bare File.mkdir/1 call did.
File.mkdir_p!(tmp_dir)
{:error, :eexist}
We install the dependencies for our Python script.
# Create an isolated Python virtualenv inside the scratch directory,
# streaming the command's output into the notebook.
System.shell("python3 -m venv .venv", cd: tmp_dir, into: IO.binstream())
# Install the Python dependencies into the venv. Setting VIRTUAL_ENV and
# prepending the venv's bin/ to PATH makes `pip3` resolve to the venv's pip,
# so packages land in .venv rather than the system site-packages.
System.shell(
  "pip3 install sentence-transformers flask gunicorn",
  cd: tmp_dir,
  env: [
    {"VIRTUAL_ENV", Path.join([tmp_dir, ".venv"])},
    {"PATH", "#{Path.join([tmp_dir, ".venv", "bin"])}:#{System.get_env("PATH")}"}
  ],
  into: IO.binstream()
)
Collecting sentence-transformers
Using cached sentence_transformers-2.2.2-py3-none-any.whl
Collecting flask
Using cached Flask-2.3.2-py3-none-any.whl (96 kB)
Collecting gunicorn
Using cached gunicorn-20.1.0-py3-none-any.whl (79 kB)
Collecting transformers<5.0.0,>=4.6.0
Using cached transformers-4.30.2-py3-none-any.whl (7.2 MB)
Collecting tqdm
Using cached tqdm-4.65.0-py3-none-any.whl (77 kB)
Collecting torch>=1.6.0
Using cached torch-2.0.1-cp311-none-macosx_11_0_arm64.whl (55.8 MB)
Collecting torchvision
Using cached torchvision-0.15.2-cp311-cp311-macosx_11_0_arm64.whl (1.4 MB)
Collecting numpy
Using cached numpy-1.25.0-cp311-cp311-macosx_11_0_arm64.whl (14.0 MB)
Collecting scikit-learn
Using cached scikit_learn-1.2.2-cp311-cp311-macosx_12_0_arm64.whl (8.4 MB)
Collecting scipy
Using cached scipy-1.10.1-cp311-cp311-macosx_12_0_arm64.whl (28.7 MB)
Collecting nltk
Using cached nltk-3.8.1-py3-none-any.whl (1.5 MB)
Collecting sentencepiece
Using cached sentencepiece-0.1.99-cp311-cp311-macosx_11_0_arm64.whl (1.2 MB)
Collecting huggingface-hub>=0.4.0
Using cached huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
Collecting Werkzeug>=2.3.3
Using cached Werkzeug-2.3.6-py3-none-any.whl (242 kB)
Collecting Jinja2>=3.1.2
Using cached Jinja2-3.1.2-py3-none-any.whl (133 kB)
Collecting itsdangerous>=2.1.2
Using cached itsdangerous-2.1.2-py3-none-any.whl (15 kB)
Collecting click>=8.1.3
Using cached click-8.1.3-py3-none-any.whl (96 kB)
Collecting blinker>=1.6.2
Using cached blinker-1.6.2-py3-none-any.whl (13 kB)
Requirement already satisfied: setuptools>=3.0 in ./.venv/lib/python3.11/site-packages (from gunicorn) (67.6.1)
Collecting filelock
Using cached filelock-3.12.2-py3-none-any.whl (10 kB)
Collecting fsspec
Using cached fsspec-2023.6.0-py3-none-any.whl (163 kB)
Collecting requests
Using cached requests-2.31.0-py3-none-any.whl (62 kB)
Collecting pyyaml>=5.1
Using cached PyYAML-6.0-cp311-cp311-macosx_11_0_arm64.whl (167 kB)
Collecting typing-extensions>=3.7.4.3
Using cached typing_extensions-4.6.3-py3-none-any.whl (31 kB)
Collecting packaging>=20.9
Using cached packaging-23.1-py3-none-any.whl (48 kB)
Collecting MarkupSafe>=2.0
Using cached MarkupSafe-2.1.3-cp311-cp311-macosx_10_9_universal2.whl (17 kB)
Collecting sympy
Using cached sympy-1.12-py3-none-any.whl (5.7 MB)
Collecting networkx
Using cached networkx-3.1-py3-none-any.whl (2.1 MB)
Collecting regex!=2019.12.17
Using cached regex-2023.6.3-cp311-cp311-macosx_11_0_arm64.whl (288 kB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
Using cached tokenizers-0.13.3-cp311-cp311-macosx_12_0_arm64.whl (3.9 MB)
Collecting safetensors>=0.3.1
Using cached safetensors-0.3.1-cp311-cp311-macosx_12_0_arm64.whl (401 kB)
Collecting joblib
Using cached joblib-1.2.0-py3-none-any.whl (297 kB)
Collecting threadpoolctl>=2.0.0
Using cached threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Collecting pillow!=8.3.*,>=5.3.0
Using cached Pillow-9.5.0-cp311-cp311-macosx_11_0_arm64.whl (3.1 MB)
Collecting charset-normalizer<4,>=2
Using cached charset_normalizer-3.1.0-cp311-cp311-macosx_11_0_arm64.whl (121 kB)
Collecting idna<4,>=2.5
Using cached idna-3.4-py3-none-any.whl (61 kB)
Collecting urllib3<3,>=1.21.1
Using cached urllib3-2.0.3-py3-none-any.whl (123 kB)
Collecting certifi>=2017.4.17
Using cached certifi-2023.5.7-py3-none-any.whl (156 kB)
Collecting mpmath>=0.19
Using cached mpmath-1.3.0-py3-none-any.whl (536 kB)
Installing collected packages: tokenizers, sentencepiece, safetensors, mpmath, urllib3, typing-extensions, tqdm, threadpoolctl, sympy, regex, pyyaml, pillow, packaging, numpy, networkx, MarkupSafe, joblib, itsdangerous, idna, gunicorn, fsspec, filelock, click, charset-normalizer, certifi, blinker, Werkzeug, scipy, requests, nltk, Jinja2, torch, scikit-learn, huggingface-hub, flask, transformers, torchvision, sentence-transformers
Successfully installed Jinja2-3.1.2 MarkupSafe-2.1.3 Werkzeug-2.3.6 blinker-1.6.2 certifi-2023.5.7 charset-normalizer-3.1.0 click-8.1.3 filelock-3.12.2 flask-2.3.2 fsspec-2023.6.0 gunicorn-20.1.0 huggingface-hub-0.15.1 idna-3.4 itsdangerous-2.1.2 joblib-1.2.0 mpmath-1.3.0 networkx-3.1 nltk-3.8.1 numpy-1.25.0 packaging-23.1 pillow-9.5.0 pyyaml-6.0 regex-2023.6.3 requests-2.31.0 safetensors-0.3.1 scikit-learn-1.2.2 scipy-1.10.1 sentence-transformers-2.2.2 sentencepiece-0.1.99 sympy-1.12 threadpoolctl-3.1.0 tokenizers-0.13.3 torch-2.0.1 torchvision-0.15.2 tqdm-4.65.0 transformers-4.30.2 typing-extensions-4.6.3 urllib3-2.0.3
{%IO.Stream{device: :standard_io, raw: true, line_or_bytes: :line}, 0}
# Write a small Flask app to disk that mirrors our Elixir serving:
#   GET  /        -> encodes the benchmark `text` (interpolated at write time)
#   POST /result  -> encodes the request body and returns the embedding as JSON
# NOTE(review): `#{text}` is spliced directly into the Python source, which
# assumes `text` contains no double quotes or backslashes — verify before
# reusing this with arbitrary input.
File.write!(tmp_dir <> "/simple.py", """
from flask import Flask, jsonify, request
from sentence_transformers import SentenceTransformer
app = Flask(__name__)
model = SentenceTransformer("all-MiniLM-L6-v2")
@app.route("/")
def encode():
model.encode("#{text}")
return "ok"
@app.route("/result", methods=["POST"])
def result():
text = request.stream.read().decode("utf-8")
[query_embed] = model.encode([text])
return jsonify(query_embed.tolist())
""")
:ok
# Start gunicorn (8 sync workers on port 5001) serving the Python embedding
# app, supervised as a Kino child so it is cleaned up with the notebook.
{:ok, pid} =
  Kino.start_child(
    {Task,
     fn ->
       port =
         Port.open(
           {:spawn, "gunicorn -w 8 -b 0.0.0.0:5001 simple:app"},
           [
             :binary,
             :use_stdio,
             :stderr_to_stdout,
             {:cd, tmp_dir},
             # Port env keys/values must be charlists; use the ~c sigil instead
             # of the deprecated single-quote charlist literal syntax.
             {:env,
              [
                {~c"VIRTUAL_ENV", Path.join([tmp_dir, ".venv"]) |> String.to_charlist()},
                {~c"PATH",
                 "#{Path.join([tmp_dir, ".venv", "bin"])}:#{System.get_env("PATH")}"
                 |> String.to_charlist()}
              ]}
           ]
         )

       # Snapshot the OS pid now: Port.info/1 returns nil once the port closes.
       info = Port.info(port)

       # If the Erlang port goes down, reap the external gunicorn master so we
       # never leak the OS process.
       spawn(fn ->
         ref = Port.monitor(port)

         receive do
           {:DOWN, ^ref, :port, _, _} ->
             # System.cmd/3 passes the pid as an argv entry — no shell string
             # interpolation as with the previous System.shell/1 call.
             System.cmd("kill", [Integer.to_string(Keyword.fetch!(info, :os_pid))])
             :ok
         end
       end)

       # Forward the server's stdout/stderr to the notebook indefinitely.
       Stream.repeatedly(fn ->
         receive do
           {^port, {:data, data}} -> IO.puts(data)
         end
       end)
       |> Stream.run()
     end}
  )
{:ok, #PID<0.22021.1>}
[2023-06-25 19:06:03 +0200] [8575] [INFO] Starting gunicorn 20.1.0
[2023-06-25 19:06:03 +0200] [8575] [INFO] Listening at: http://0.0.0.0:5001 (8575)
[2023-06-25 19:06:03 +0200] [8575] [INFO] Using worker: sync
[2023-06-25 19:06:03 +0200] [8576] [INFO] Booting worker with pid: 8576
[2023-06-25 19:06:03 +0200] [8577] [INFO] Booting worker with pid: 8577
[2023-06-25 19:06:03 +0200] [8578] [INFO] Booting worker with pid: 8578
[2023-06-25 19:06:03 +0200] [8579] [INFO] Booting worker with pid: 8579
[2023-06-25 19:06:03 +0200] [8580] [INFO] Booting worker with pid: 8580
[2023-06-25 19:06:03 +0200] [8581] [INFO] Booting worker with pid: 8581
[2023-06-25 19:06:04 +0200] [8584] [INFO] Booting worker with pid: 8584
[2023-06-25 19:06:04 +0200] [8585] [INFO] Booting worker with pid: 8585
Benchmark the Python code by measuring the number of HTTP requests we can send. This of course has an overhead; you can also try using wrk on the CLI:
wrk http://127.0.0.1:5001 -t 8 -c 16 -d 10
# Benchmark the Python server by hammering its GET / endpoint over HTTP
# (runs only when the button is pressed).
RunOnButtonPress.run(:python_bench, "Run benchmark", fn ->
  request = fn -> Req.get!("http://127.0.0.1:5001") end

  Benchee.run(%{"python" => request}, time: 10, memory_time: 2)
end)
nil
# The same sentence pair as before, now for the Python-side similarity check.
text_a = "the cat is fluffy"
text_b = "the dog is fluffy"
"the dog is fluffy"
# Embed text_a via the Python server; the JSON list body becomes an Nx tensor.
vec_a = Req.post!("http://127.0.0.1:5001/result", body: text_a).body |> Nx.tensor()
#Nx.Tensor<
f32[384]
EXLA.Backend<host:0, 0.2469518244.3382837278.175301>
[0.06779886782169342, 0.007655660156160593, 0.03706275299191475, 0.049517177045345306, -0.07068011909723282, -0.011639258824288845, 0.03958830237388611, -0.0247760321944952, -0.06214709207415581, 0.018531648442149162, -0.044966667890548706, -0.06568001955747604, 0.017505094408988953, 0.07331708073616028, -0.024130230769515038, -0.02478799968957901, -0.05236723646521568, -0.023084810003638268, -0.006543673574924469, 0.0451006144285202, 0.01846838742494583, 0.029824305325746536, 0.005736138205975294, -0.010892089456319809, -0.046166203916072845, 0.0023931306786835194, -0.04240777716040611, -0.07943931221961975, -0.0053918794728815556, -0.028077997267246246, -0.04632686451077461, -6.664271932095289e-4, -0.030054941773414612, 0.06260722875595093, -0.021148040890693665, -0.07958880066871643, 0.013744302093982697, -0.03309154137969017, 0.040516577661037445, 0.07242394238710403, -0.06352505087852478, -0.017378678545355797, -0.02157386764883995, -0.011999706737697124, -0.021189572289586067, -0.00876727793365717, 0.07872916013002396, -0.06302112340927124, 0.030323605984449387, -0.017315112054347992, ...]
>
# Embed text_b the same way.
vec_b = Req.post!("http://127.0.0.1:5001/result", body: text_b).body |> Nx.tensor()
#Nx.Tensor<
f32[384]
EXLA.Backend<host:0, 0.2469518244.3382837278.175302>
[0.0010527605190873146, -0.01814900152385235, 0.06298713386058807, 0.08370770514011383, -0.029350081458687782, -0.005350092891603708, 0.03645104169845581, -0.04092768579721451, -0.021120892837643623, 0.036505334079265594, -0.03726349025964737, -0.07576442509889603, 0.06701824069023132, 0.07903903722763062, -0.0203632190823555, -0.00780709832906723, -0.012300821952521801, -0.04318765923380852, -0.0011817753547802567, -0.022412795573472977, -0.016269907355308533, 0.0323394276201725, 0.005019240081310272, -0.02840074896812439, -0.06293667107820511, -0.03826148062944412, 0.005676363594830036, -0.09829479455947876, 0.010830196551978588, -0.05873095616698265, -0.0321737602353096, -0.010260816663503647, -0.047368600964546204, 0.04204658418893814, 0.009711719118058681, -0.04026227071881294, 0.04448661580681801, 0.0010633632773533463, 0.04256034642457962, 0.041687242686748505, -0.0025316202081739902, 0.02852257713675499, 0.002603031462058425, -0.04672356694936752, -0.022562487050890923, -0.017681684345006943, -0.011620769277215004, -0.07063472270965576, 0.03322093188762665, 0.009530088864266872, ...]
>
# Cosine similarity of the Python-produced embeddings; should match the
# Elixir-side results above up to floating-point rounding.
Bumblebee.Utils.Nx.cosine_similarity(vec_a, vec_b)
#Nx.Tensor<
f32
EXLA.Backend<host:0, 0.2469518244.3383099408.7082>
0.7277914881706238
>
Plot ips / sequence length for Python:
# Live line chart of throughput (ips) against input sequence length.
spec =
  Vl.new(width: 1280, height: 720)
  |> Vl.mark(:line)
  |> Vl.encode_field(:x, "sequence_length", type: :quantitative)
  |> Vl.encode_field(:y, "ips", type: :quantitative)

chart = Kino.VegaLite.new(spec)
# For each (text, token-length) pair: measure POST /result throughput with 32
# concurrent clients for 2 seconds, then stream the point onto the live chart.
RunOnButtonPress.run(:python_sequence_length, "Generate Graph", fn ->
  for {input, seq_len} <- Enum.zip(texts, sequence_lengths) do
    IO.inspect(input)

    bench =
      ConcurrentBench.run(
        fn -> Req.post!("http://127.0.0.1:5001/result", body: input) end,
        32,
        2_000
      )

    IO.inspect("sequence_length: #{seq_len}, ips: #{bench.ips}")
    Kino.VegaLite.push(chart, %{ips: bench.ips, sequence_length: seq_len})
  end
end)
nil
"Lorem"
"sequence_length: 4, ips: 252.60782634959224"
"Lorem ipsum"
"sequence_length: 6, ips: 255.32089239398564"
"Lorem ipsum dolor"
"sequence_length: 8, ips: 250.37231012183787"
"Lorem ipsum dolor sit"
"sequence_length: 9, ips: 245.69595126031535"
"Lorem ipsum dolor sit amet,"
"sequence_length: 12, ips: 227.75220560030687"
"Lorem ipsum dolor sit amet, consetetur"
"sequence_length: 16, ips: 220.8095517616056"
"Lorem ipsum dolor sit amet, consetetur sadipscing"
"sequence_length: 20, ips: 205.40222853921534"
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr,"
"sequence_length: 23, ips: 212.8677027227578"
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed"
"sequence_length: 25, ips: 189.87041344282528"
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam"
"sequence_length: 27, ips: 181.40186159287828"
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy"
"sequence_length: 30, ips: 199.78463216652452"
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod"
"sequence_length: 34, ips: 189.84243078245055"
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor"
"sequence_length: 36, ips: 194.01389817811472"
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt"
"sequence_length: 39, ips: 187.35807625723515"
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut"
"sequence_length: 40, ips: 185.86589775476992"
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore"
"sequence_length: 42, ips: 175.35804766041892"
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et"
"sequence_length: 43, ips: 182.3307059395351"
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore"
"sequence_length: 45, ips: 180.61483885613927"
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna"
"sequence_length: 46, ips: 179.82889280849272"
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam"
"sequence_length: 49, ips: 174.3383883140329"
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat,"
"sequence_length: 52, ips: 175.33246982508211"
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed"
"sequence_length: 54, ips: 174.19376735698643"
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam"
"sequence_length: 56, ips: 172.90809934519805"
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua."
"sequence_length: 60, ips: 170.57841544622465"
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At"
"sequence_length: 61, ips: 168.01778894572578"
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero"
"sequence_length: 63, ips: 166.3693169015738"
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos"
"sequence_length: 65, ips: 158.38050191130793"
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et"
"sequence_length: 66, ips: 156.35130990428104"
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam"
"sequence_length: 69, ips: 158.3029919265474"
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et"
"sequence_length: 70, ips: 157.8636058445503"
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo"
"sequence_length: 72, ips: 158.42656928513634"
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo"
"sequence_length: 73, ips: 155.35272561611592"
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores"
"sequence_length: 74, ips: 158.3642818104884"
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et"
"sequence_length: 75, ips: 153.87543783307413"
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea"
"sequence_length: 76, ips: 156.4069378719662"
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum."
"sequence_length: 80, ips: 142.85878409192512"
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet"
"sequence_length: 82, ips: 148.8350907194828"
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita"
"sequence_length: 84, ips: 140.92425321389752"
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd"
"sequence_length: 86, ips: 130.93283145746233"
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren,"
"sequence_length: 90, ips: 137.86971312110057"
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no"
"sequence_length: 91, ips: 135.70308165733374"
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea"
"sequence_length: 92, ips: 130.38923434542357"
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata"
"sequence_length: 95, ips: 119.37877085819349"
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus"
"sequence_length: 97, ips: 123.932704541434"
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est"
"sequence_length: 98, ips: 130.37444940522278"
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem"
"sequence_length: 100, ips: 124.32451594574259"
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum"
"sequence_length: 102, ips: 125.35991030023946"
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor"
"sequence_length: 104, ips: 127.91321088641357"
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit"
"sequence_length: 105, ips: 127.88950346900279"
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet."
"sequence_length: 108, ips: 129.4375463838698"
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem"
"sequence_length: 110, ips: 123.88051724112094"
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum"
"sequence_length: 112, ips: 122.34254514439915"
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor"
"sequence_length: 114, ips: 122.45395731205068"
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit"
"sequence_length: 115, ips: 123.94676486449069"
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet,"
"sequence_length: 118, ips: 117.42848605199433"
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur"
"sequence_length: 122, ips: 113.8583601999113"
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing"
"sequence_length: 126, ips: 117.37581638626334"
"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr"
"sequence_length: 128, ips: 117.84385688962125"
# Shut down the supervised gunicorn task; the port-monitor process spawned
# above then kills the external gunicorn OS process.
Kino.terminate_child(pid)
:ok