/* Created June 28, 2024 05:41
 * Transformer in pure C, written by ChatGPT. */
/* This file is written by chatgpt */
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define MAX_SEQUENCE_LENGTH 512 ///< Maximum sequence length for input data
#define MAX_EMBEDDING_DIM 512 ///< Maximum embedding dimension for the model
/**
 * Represents a matrix with floating point data.
 *
 * Elements are stored in a single contiguous buffer in row-major order:
 * element (r, c) lives at data[r * cols + c].
 */
typedef struct {
  float *data; ///< Pointer to the matrix data (rows * cols elements)
  int rows;    ///< Number of rows in the matrix
  int cols;    ///< Number of columns in the matrix
} Matrix;
* Represents the weights for a single attention layer in the transformer model.
typedef struct {
Matrix query_weight; ///< Query weight matrix
Matrix key_weight; ///< Key weight matrix
Matrix value_weight; ///< Value weight matrix
Matrix output_weight; ///< Output weight matrix
} AttentionLayer;
* Represents a single transformer block, which includes an attention layer and
* a feed-forward network.
typedef struct {
AttentionLayer attention; ///< Attention layer
Matrix feed_forward_weight1; ///< First weight matrix for the feed-forward
///< network
Matrix feed_forward_weight2; ///< Second weight matrix for the feed-forward
///< network
Matrix norm_gamma; ///< Gamma parameter for layer normalization
Matrix norm_beta; ///< Beta parameter for layer normalization
} TransformerBlock;
* Represents the entire transformer model, consisting of multiple transformer
* blocks.
typedef struct {
Matrix embedding; ///< Embedding matrix for input data
TransformerBlock *blocks; ///< Pointer to an array of transformer blocks
int num_blocks; ///< Number of transformer blocks in the model
} TransformerModel;
* Creates a matrix with the specified number of rows and columns.
* Allocates memory for the matrix data.
* @param rows Number of rows in the matrix
* @param cols Number of columns in the matrix
* @return The created matrix
Matrix create_matrix(int rows, int cols) {
Matrix mat;
mat.rows = rows;
mat.cols = cols; = (float *)calloc(
rows * cols, sizeof(float)); // Use calloc for zero-initialization
return mat;
* Frees the memory allocated for a matrix.
* @param mat Pointer to the matrix to be freed
void free_matrix(Matrix *mat) {
mat->data = NULL;
mat->rows = 0;
mat->cols = 0;
* Multiplies two matrices and stores the result in a third matrix.
* @param a Pointer to the first matrix
* @param b Pointer to the second matrix
* @param result Pointer to the matrix where the result will be stored
void matmul(Matrix *a, Matrix *b, Matrix *result) {
if (a->cols != b->rows)
for (int i = 0; i < a->rows; i++) {
for (int j = 0; j < b->cols; j++) {
float sum = 0;
for (int k = 0; k < a->cols; k++) {
sum += a->data[i * a->cols + k] * b->data[k * b->cols + j];
result->data[i * result->cols + j] = sum;
* Adds two matrices and stores the result in a third matrix.
* @param a Pointer to the first matrix
* @param b Pointer to the second matrix
* @param result Pointer to the matrix where the result will be stored
void add_matrix(Matrix *a, Matrix *b, Matrix *result) {
int size = a->rows * a->cols;
for (int i = 0; i < size; i++) {
result->data[i] = a->data[i] + b->data[i];
* Applies positional encoding to an embedding matrix.
* @param embedding Pointer to the embedding matrix
* @param max_len Maximum length of the sequences
* @param d_model Dimension of the model
void positional_encoding(Matrix *embedding, int max_len, int d_model) {
for (int pos = 0; pos < max_len; pos++) {
for (int i = 0; i < d_model; i += 2) {
embedding->data[pos * d_model + i] =
sin(pos / pow(10000, (float)i / d_model));
if (i + 1 < d_model) {
embedding->data[pos * d_model + i + 1] =
cos(pos / pow(10000, (float)(i + 1) / d_model));
* Performs self-attention on the input matrix and stores the result in the
* output matrix.
* @param input Pointer to the input matrix
* @param layer Pointer to the attention layer containing the weights
* @param output Pointer to the matrix where the result will be stored
void self_attention(Matrix *input, AttentionLayer *layer, Matrix *output) {
Matrix q = create_matrix(input->rows, layer->query_weight.cols);
Matrix k = create_matrix(input->rows, layer->key_weight.cols);
Matrix v = create_matrix(input->rows, layer->value_weight.cols);
Matrix temp = create_matrix(input->rows, input->rows);
matmul(input, &layer->query_weight, &q);
matmul(input, &layer->key_weight, &k);
matmul(input, &layer->value_weight, &v);
matmul(&q, &k, &temp);
for (int i = 0; i < temp.rows * temp.cols; i++) {[i] /= sqrt(k.cols);
for (int i = 0; i < temp.rows; i++) {
float sum = 0.0;
for (int j = 0; j < temp.cols; j++) {[i * temp.cols + j] = exp([i * temp.cols + j]);
sum +=[i * temp.cols + j];
for (int j = 0; j < temp.cols; j++) {[i * temp.cols + j] /= sum;
matmul(&temp, &v, output);
* Applies a feed-forward neural network to the input matrix and stores the
* result in the output matrix.
* @param input Pointer to the input matrix
* @param weight1 Pointer to the first weight matrix of the feed-forward network
* @param weight2 Pointer to the second weight matrix of the feed-forward
* network
* @param output Pointer to the matrix where the result will be stored
void feed_forward(Matrix *input, Matrix *weight1, Matrix *weight2,
Matrix *output) {
Matrix temp = create_matrix(input->rows, weight1->cols);
matmul(input, weight1, &temp);
for (int i = 0; i < temp.rows * temp.cols; i++) {[i] = fmax(0,[i]); // ReLU activation
matmul(&temp, weight2, output);
* Processes an input matrix through a transformer block and stores the result
* in the output matrix.
* @param input Pointer to the input matrix
* @param block Pointer to the transformer block containing the layer weights
* @param output Pointer to the matrix where the result will be stored
void transformer_block(Matrix *input, TransformerBlock *block, Matrix *output) {
Matrix attn_output = create_matrix(input->rows, input->cols);
Matrix norm_output = create_matrix(input->rows, input->cols);
Matrix ff_output =
create_matrix(input->rows, block->feed_forward_weight2.cols);
self_attention(input, &block->attention, &attn_output);
add_matrix(input, &attn_output, &norm_output);
feed_forward(&norm_output, &block->feed_forward_weight1,
&block->feed_forward_weight2, &ff_output);
add_matrix(&norm_output, &ff_output, output);
* Processes an input matrix through the entire transformer model and stores the
* result in the output matrix.
* @param input Pointer to the input matrix
* @param model Pointer to the transformer model
* @param output Pointer to the matrix where the result will be stored
void transformer_forward(Matrix *input, TransformerModel *model,
Matrix *output) {
Matrix temp_input = create_matrix(input->rows, input->cols);
memcpy(, input->data,
input->rows * input->cols * sizeof(float));
for (int i = 0; i < model->num_blocks; i++) {
Matrix temp_output = create_matrix(temp_input.rows, temp_input.cols);
transformer_block(&temp_input, &model->blocks[i], &temp_output);
temp_input = temp_output;
temp_input.rows * temp_input.cols * sizeof(float));
int main() {
// Example usage
int num_blocks = 2;
int embedding_dim = 8;
int seq_length = 4;
// Initialize the transformer model
TransformerModel model;
model.num_blocks = num_blocks;
model.blocks =
(TransformerBlock *)malloc(num_blocks * sizeof(TransformerBlock));
// Allocate memory for each transformer block
for (int i = 0; i < num_blocks; i++) {
model.blocks[i].attention.query_weight =
create_matrix(embedding_dim, embedding_dim);
model.blocks[i].attention.key_weight =
create_matrix(embedding_dim, embedding_dim);
model.blocks[i].attention.value_weight =
create_matrix(embedding_dim, embedding_dim);
model.blocks[i].attention.output_weight =
create_matrix(embedding_dim, embedding_dim);
model.blocks[i].feed_forward_weight1 =
create_matrix(embedding_dim, embedding_dim);
model.blocks[i].feed_forward_weight2 =
create_matrix(embedding_dim, embedding_dim);
model.blocks[i].norm_gamma = create_matrix(1, embedding_dim);
model.blocks[i].norm_beta = create_matrix(1, embedding_dim);
// Create input and output matrices
Matrix input = create_matrix(seq_length, embedding_dim);
Matrix output = create_matrix(seq_length, embedding_dim);
// Initialize input with some values
for (int i = 0; i < input.rows * input.cols; i++) {[i] = (float)i / 10.0;
// Apply the transformer model
transformer_forward(&input, &model, &output);
// Print the output
printf("Output Matrix:\n");
for (int i = 0; i < output.rows; i++) {
for (int j = 0; j < output.cols; j++) {
printf("%f ",[i * output.cols + j]);
// Free memory
for (int i = 0; i < num_blocks; i++) {
return 0;
/*
PlantUML class diagram of the types and functions in this file.

@startuml
class Matrix {
  +float *data
  +int rows
  +int cols
}
class AttentionLayer {
  +Matrix query_weight
  +Matrix key_weight
  +Matrix value_weight
  +Matrix output_weight
}
class TransformerBlock {
  +AttentionLayer attention
  +Matrix feed_forward_weight1
  +Matrix feed_forward_weight2
  +Matrix norm_gamma
  +Matrix norm_beta
}
class TransformerModel {
  +Matrix embedding
  +TransformerBlock *blocks
  +int num_blocks
}
class Main {
  +int main()
}
class "Matrix Operations" {
  +Matrix create_matrix(int rows, int cols)
  +void free_matrix(Matrix *mat)
  +void matmul(Matrix *a, Matrix *b, Matrix *result)
  +void add_matrix(Matrix *a, Matrix *b, Matrix *result)
  +void positional_encoding(Matrix *embedding, int max_len, int d_model)
}
class "Transformer Operations" {
  +void self_attention(Matrix *input, AttentionLayer *layer, Matrix *output)
  +void feed_forward(Matrix *input, Matrix *weight1, Matrix *weight2, Matrix *output)
  +void transformer_block(Matrix *input, TransformerBlock *block, Matrix *output)
  +void transformer_forward(Matrix *input, TransformerModel *model, Matrix *output)
}
Main -right-> TransformerModel
TransformerModel -right-> TransformerBlock
TransformerBlock -right-> AttentionLayer
TransformerModel *-- Matrix
AttentionLayer *-- Matrix
TransformerBlock *-- Matrix
"Matrix Operations" ..> Matrix : uses
"Transformer Operations" ..> Matrix : uses
"Transformer Operations" ..> AttentionLayer : uses
"Transformer Operations" ..> TransformerBlock : uses
"Transformer Operations" ..> TransformerModel : uses
@enduml
*/
