@EvanLyu732
Created June 28, 2024 05:41
A transformer forward pass in pure C, written by ChatGPT.
/* This file is written by chatgpt */
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define MAX_SEQUENCE_LENGTH 512 ///< Maximum sequence length (currently unused)
#define MAX_EMBEDDING_DIM 512   ///< Maximum embedding dimension (currently unused)
/**
* Represents a matrix with floating point data.
*/
typedef struct {
  float *data; ///< Pointer to the matrix data
  int rows;    ///< Number of rows in the matrix
  int cols;    ///< Number of columns in the matrix
} Matrix;
/**
* Represents the weights for a single attention layer in the transformer model.
*/
typedef struct {
  Matrix query_weight;  ///< Query weight matrix
  Matrix key_weight;    ///< Key weight matrix
  Matrix value_weight;  ///< Value weight matrix
  Matrix output_weight; ///< Output projection weight matrix
} AttentionLayer;
/**
* Represents a single transformer block, which includes an attention layer and
* a feed-forward network.
*/
typedef struct {
  AttentionLayer attention;    ///< Attention layer
  Matrix feed_forward_weight1; ///< First weight matrix for the feed-forward network
  Matrix feed_forward_weight2; ///< Second weight matrix for the feed-forward network
  Matrix norm_gamma;           ///< Gamma parameter for layer normalization
  Matrix norm_beta;            ///< Beta parameter for layer normalization
} TransformerBlock;
/**
* Represents the entire transformer model, consisting of multiple transformer
* blocks.
*/
typedef struct {
  Matrix embedding;         ///< Embedding matrix for input data (unused in the demo below)
  TransformerBlock *blocks; ///< Pointer to an array of transformer blocks
  int num_blocks;           ///< Number of transformer blocks in the model
} TransformerModel;
/**
* Creates a matrix with the specified number of rows and columns.
* Allocates memory for the matrix data.
*
* @param rows Number of rows in the matrix
* @param cols Number of columns in the matrix
* @return The created matrix
*/
Matrix create_matrix(int rows, int cols) {
  Matrix mat;
  mat.rows = rows;
  mat.cols = cols;
  // calloc zero-initializes the data; note that allocation failure (NULL)
  // is not checked anywhere in this file.
  mat.data = (float *)calloc(rows * cols, sizeof(float));
  return mat;
}
/**
* Frees the memory allocated for a matrix.
*
* @param mat Pointer to the matrix to be freed
*/
void free_matrix(Matrix *mat) {
  free(mat->data);
  mat->data = NULL;
  mat->rows = 0;
  mat->cols = 0;
}
/**
* Multiplies two matrices and stores the result in a third matrix.
*
* @param a Pointer to the first matrix
* @param b Pointer to the second matrix
* @param result Pointer to the matrix where the result will be stored
*/
void matmul(Matrix *a, Matrix *b, Matrix *result) {
  // On a dimension mismatch this returns silently, leaving `result`
  // untouched (all zeros if freshly created) and masking the error.
  if (a->cols != b->rows)
    return;
  for (int i = 0; i < a->rows; i++) {
    for (int j = 0; j < b->cols; j++) {
      float sum = 0;
      for (int k = 0; k < a->cols; k++) {
        sum += a->data[i * a->cols + k] * b->data[k * b->cols + j];
      }
      result->data[i * result->cols + j] = sum;
    }
  }
}
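/* Hypothetical demo, not in the original gist: a quick check of the
 * row-major convention matmul assumes. With a = [[0 1 2] [3 4 5]] and b a
 * 3x2 matrix of ones, the product is [[3 3] [12 12]]. */
static void demo_matmul(void) {
  Matrix a = create_matrix(2, 3);
  Matrix b = create_matrix(3, 2);
  Matrix c = create_matrix(2, 2);
  for (int i = 0; i < 6; i++) a.data[i] = (float)i;
  for (int i = 0; i < 6; i++) b.data[i] = 1.0f;
  matmul(&a, &b, &c);
  printf("%.0f %.0f / %.0f %.0f\n", c.data[0], c.data[1], c.data[2], c.data[3]);
  free_matrix(&a);
  free_matrix(&b);
  free_matrix(&c);
}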
/**
* Adds two matrices and stores the result in a third matrix.
*
* @param a Pointer to the first matrix
* @param b Pointer to the second matrix
* @param result Pointer to the matrix where the result will be stored
*/
void add_matrix(Matrix *a, Matrix *b, Matrix *result) {
  // Assumes a, b, and result all share the same dimensions; no check is made.
  int size = a->rows * a->cols;
  for (int i = 0; i < size; i++) {
    result->data[i] = a->data[i] + b->data[i];
  }
}
/**
* Applies positional encoding to an embedding matrix.
*
* @param embedding Pointer to the embedding matrix
* @param max_len Maximum length of the sequences
* @param d_model Dimension of the model
*/
void positional_encoding(Matrix *embedding, int max_len, int d_model) {
  for (int pos = 0; pos < max_len; pos++) {
    for (int i = 0; i < d_model; i += 2) {
      // Sinusoidal encoding from "Attention Is All You Need": each sin/cos
      // pair shares the frequency 1 / 10000^(i / d_model). (The original
      // code used (i + 1) / d_model for the cosine term.)
      float angle = pos / pow(10000, (float)i / d_model);
      embedding->data[pos * d_model + i] = sin(angle);
      if (i + 1 < d_model) {
        embedding->data[pos * d_model + i + 1] = cos(angle);
      }
    }
  }
}
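/* Note: positional_encoding overwrites its target instead of adding to it,
 * and the demo in main() never calls it. A hedged sketch of how it could be
 * combined with already-embedded input (helper name is illustrative): */
static void add_positional_encoding(Matrix *embedded) {
  Matrix pe = create_matrix(embedded->rows, embedded->cols);
  positional_encoding(&pe, embedded->rows, embedded->cols);
  add_matrix(embedded, &pe, embedded); // element-wise, safe to alias
  free_matrix(&pe);
}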
/**
* Performs self-attention on the input matrix and stores the result in the
* output matrix.
*
* @param input Pointer to the input matrix
* @param layer Pointer to the attention layer containing the weights
* @param output Pointer to the matrix where the result will be stored
*/
void self_attention(Matrix *input, AttentionLayer *layer, Matrix *output) {
  Matrix q = create_matrix(input->rows, layer->query_weight.cols);
  Matrix k = create_matrix(input->rows, layer->key_weight.cols);
  Matrix v = create_matrix(input->rows, layer->value_weight.cols);
  Matrix scores = create_matrix(input->rows, input->rows);
  matmul(input, &layer->query_weight, &q);
  matmul(input, &layer->key_weight, &k);
  matmul(input, &layer->value_weight, &v);
  // Scores = Q * K^T / sqrt(d_k). The original called matmul(&q, &k, ...),
  // i.e. Q * K, which requires q.cols == k.rows and therefore failed
  // silently (leaving all-zero scores) whenever seq_len != d_k.
  for (int i = 0; i < q.rows; i++) {
    for (int j = 0; j < k.rows; j++) {
      float sum = 0;
      for (int d = 0; d < q.cols; d++) {
        sum += q.data[i * q.cols + d] * k.data[j * k.cols + d];
      }
      scores.data[i * scores.cols + j] = sum / sqrt(k.cols);
    }
  }
  // Row-wise softmax, shifted by the row maximum for numerical stability.
  for (int i = 0; i < scores.rows; i++) {
    float max_val = scores.data[i * scores.cols];
    for (int j = 1; j < scores.cols; j++) {
      if (scores.data[i * scores.cols + j] > max_val)
        max_val = scores.data[i * scores.cols + j];
    }
    float sum = 0.0;
    for (int j = 0; j < scores.cols; j++) {
      scores.data[i * scores.cols + j] =
          exp(scores.data[i * scores.cols + j] - max_val);
      sum += scores.data[i * scores.cols + j];
    }
    for (int j = 0; j < scores.cols; j++) {
      scores.data[i * scores.cols + j] /= sum;
    }
  }
  // Weighted sum of the values, then the output projection W_O (which the
  // original allocated and freed but never applied).
  Matrix attn = create_matrix(scores.rows, v.cols);
  matmul(&scores, &v, &attn);
  matmul(&attn, &layer->output_weight, output);
  free_matrix(&q);
  free_matrix(&k);
  free_matrix(&v);
  free_matrix(&scores);
  free_matrix(&attn);
}
/**
* Applies a feed-forward neural network to the input matrix and stores the
* result in the output matrix.
*
* @param input Pointer to the input matrix
* @param weight1 Pointer to the first weight matrix of the feed-forward network
* @param weight2 Pointer to the second weight matrix of the feed-forward
* network
* @param output Pointer to the matrix where the result will be stored
*/
void feed_forward(Matrix *input, Matrix *weight1, Matrix *weight2,
                  Matrix *output) {
  Matrix temp = create_matrix(input->rows, weight1->cols);
  matmul(input, weight1, &temp);
  for (int i = 0; i < temp.rows * temp.cols; i++) {
    temp.data[i] = fmax(0, temp.data[i]); // ReLU activation
  }
  matmul(&temp, weight2, output);
  free_matrix(&temp);
}
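/* Note: the demo in main() makes both feed-forward weights
 * embedding_dim x embedding_dim; the original transformer paper instead
 * widens the hidden layer (d_ff = 4 * d_model) before projecting back,
 * which this function would support by sizing weight1 as (d_model x d_ff)
 * and weight2 as (d_ff x d_model). */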
/**
* Processes an input matrix through a transformer block and stores the result
* in the output matrix.
*
* @param input Pointer to the input matrix
* @param block Pointer to the transformer block containing the layer weights
* @param output Pointer to the matrix where the result will be stored
*/
void transformer_block(Matrix *input, TransformerBlock *block, Matrix *output) {
  Matrix attn_output = create_matrix(input->rows, input->cols);
  Matrix norm_output = create_matrix(input->rows, input->cols);
  Matrix ff_output =
      create_matrix(input->rows, block->feed_forward_weight2.cols);
  self_attention(input, &block->attention, &attn_output);
  // Residual connections only: norm_gamma and norm_beta are carried in the
  // struct but layer normalization is never applied here (see the
  // layer_norm sketch after this function).
  add_matrix(input, &attn_output, &norm_output);
  feed_forward(&norm_output, &block->feed_forward_weight1,
               &block->feed_forward_weight2, &ff_output);
  add_matrix(&norm_output, &ff_output, output);
  free_matrix(&attn_output);
  free_matrix(&norm_output);
  free_matrix(&ff_output);
}
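/* Hypothetical helper, not part of the original gist: the block above
 * carries norm_gamma and norm_beta but never normalizes. A minimal
 * layer-norm sketch under the row-major layout used here (gamma and beta
 * are 1 x cols vectors), which could be applied to the residual sums: */
void layer_norm(Matrix *x, Matrix *gamma, Matrix *beta, float eps) {
  for (int i = 0; i < x->rows; i++) {
    // Mean and variance over the feature dimension of row i.
    float mean = 0.0f, var = 0.0f;
    for (int j = 0; j < x->cols; j++)
      mean += x->data[i * x->cols + j];
    mean /= x->cols;
    for (int j = 0; j < x->cols; j++) {
      float d = x->data[i * x->cols + j] - mean;
      var += d * d;
    }
    var /= x->cols;
    // Normalize, then scale and shift with the learned parameters.
    for (int j = 0; j < x->cols; j++) {
      float normed = (x->data[i * x->cols + j] - mean) / sqrtf(var + eps);
      x->data[i * x->cols + j] = gamma->data[j] * normed + beta->data[j];
    }
  }
}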
/**
* Processes an input matrix through the entire transformer model and stores the
* result in the output matrix.
*
* @param input Pointer to the input matrix
* @param model Pointer to the transformer model
* @param output Pointer to the matrix where the result will be stored
*/
void transformer_forward(Matrix *input, TransformerModel *model,
                         Matrix *output) {
  // Work on a copy so the caller's input is left untouched.
  Matrix temp_input = create_matrix(input->rows, input->cols);
  memcpy(temp_input.data, input->data,
         input->rows * input->cols * sizeof(float));
  for (int i = 0; i < model->num_blocks; i++) {
    // Each block maps (seq_len x d_model) to (seq_len x d_model), so the
    // output of one iteration becomes the input of the next.
    Matrix temp_output = create_matrix(temp_input.rows, temp_input.cols);
    transformer_block(&temp_input, &model->blocks[i], &temp_output);
    free_matrix(&temp_input);
    temp_input = temp_output;
  }
  memcpy(output->data, temp_input.data,
         temp_input.rows * temp_input.cols * sizeof(float));
  free_matrix(&temp_input);
}
int main() {
  // Example usage
  int num_blocks = 2;
  int embedding_dim = 8;
  int seq_length = 4;
  // Initialize the transformer model
  TransformerModel model;
  model.num_blocks = num_blocks;
  model.blocks =
      (TransformerBlock *)malloc(num_blocks * sizeof(TransformerBlock));
  model.embedding.data = NULL; // embedding is unused in this demo
  model.embedding.rows = 0;
  model.embedding.cols = 0;
  // Allocate memory for each transformer block. All weights stay
  // zero-initialized (calloc), so the printed output reflects only the
  // residual connections; see the fill_random sketch after main().
  for (int i = 0; i < num_blocks; i++) {
    model.blocks[i].attention.query_weight =
        create_matrix(embedding_dim, embedding_dim);
    model.blocks[i].attention.key_weight =
        create_matrix(embedding_dim, embedding_dim);
    model.blocks[i].attention.value_weight =
        create_matrix(embedding_dim, embedding_dim);
    model.blocks[i].attention.output_weight =
        create_matrix(embedding_dim, embedding_dim);
    model.blocks[i].feed_forward_weight1 =
        create_matrix(embedding_dim, embedding_dim);
    model.blocks[i].feed_forward_weight2 =
        create_matrix(embedding_dim, embedding_dim);
    model.blocks[i].norm_gamma = create_matrix(1, embedding_dim);
    model.blocks[i].norm_beta = create_matrix(1, embedding_dim);
  }
  // Create input and output matrices
  Matrix input = create_matrix(seq_length, embedding_dim);
  Matrix output = create_matrix(seq_length, embedding_dim);
  // Initialize input with some values
  for (int i = 0; i < input.rows * input.cols; i++) {
    input.data[i] = (float)i / 10.0;
  }
  // Apply the transformer model
  transformer_forward(&input, &model, &output);
  // Print the output
  printf("Output Matrix:\n");
  for (int i = 0; i < output.rows; i++) {
    for (int j = 0; j < output.cols; j++) {
      printf("%f ", output.data[i * output.cols + j]);
    }
    printf("\n");
  }
  // Free memory
  free_matrix(&input);
  free_matrix(&output);
  for (int i = 0; i < num_blocks; i++) {
    free_matrix(&model.blocks[i].attention.query_weight);
    free_matrix(&model.blocks[i].attention.key_weight);
    free_matrix(&model.blocks[i].attention.value_weight);
    free_matrix(&model.blocks[i].attention.output_weight);
    free_matrix(&model.blocks[i].feed_forward_weight1);
    free_matrix(&model.blocks[i].feed_forward_weight2);
    free_matrix(&model.blocks[i].norm_gamma);
    free_matrix(&model.blocks[i].norm_beta);
  }
  free(model.blocks);
  return 0;
}
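/* Build note: compile with the math library linked, e.g.
 *   cc transformer.c -o transformer -lm
 *
 * Hypothetical helper, not part of the original gist: because create_matrix
 * zero-initializes with calloc, every weight above is 0 and the demo output
 * is just the input carried through the residual connections. A minimal
 * sketch of random initialization (call srand() once, then fill_random on
 * each weight matrix after create_matrix): */
void fill_random(Matrix *m, float scale) {
  for (int i = 0; i < m->rows * m->cols; i++) {
    // Uniform in [-scale/2, scale/2]
    m->data[i] = scale * ((float)rand() / (float)RAND_MAX - 0.5f);
  }
}

The PlantUML sketch below shows how the structs and helper groups relate: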
@startuml
class Matrix {
  +float *data
  +int rows
  +int cols
}
class AttentionLayer {
  +Matrix query_weight
  +Matrix key_weight
  +Matrix value_weight
  +Matrix output_weight
}
class TransformerBlock {
  +AttentionLayer attention
  +Matrix feed_forward_weight1
  +Matrix feed_forward_weight2
  +Matrix norm_gamma
  +Matrix norm_beta
}
class TransformerModel {
  +Matrix embedding
  +TransformerBlock *blocks
  +int num_blocks
}
class Main {
  +int main()
}
class "Matrix Operations" {
  +Matrix create_matrix(int rows, int cols)
  +void free_matrix(Matrix *mat)
  +void matmul(Matrix *a, Matrix *b, Matrix *result)
  +void add_matrix(Matrix *a, Matrix *b, Matrix *result)
  +void positional_encoding(Matrix *embedding, int max_len, int d_model)
}
class "Transformer Operations" {
  +void self_attention(Matrix *input, AttentionLayer *layer, Matrix *output)
  +void feed_forward(Matrix *input, Matrix *weight1, Matrix *weight2, Matrix *output)
  +void transformer_block(Matrix *input, TransformerBlock *block, Matrix *output)
  +void transformer_forward(Matrix *input, TransformerModel *model, Matrix *output)
}
Main -right-> TransformerModel
TransformerModel -right-> TransformerBlock
TransformerBlock -right-> AttentionLayer
TransformerModel *-- Matrix
AttentionLayer *-- Matrix
TransformerBlock *-- Matrix
"Matrix Operations" ..> Matrix : uses
"Transformer Operations" ..> Matrix : uses
"Transformer Operations" ..> AttentionLayer : uses
"Transformer Operations" ..> TransformerBlock : uses
"Transformer Operations" ..> TransformerModel : uses
@enduml