flexchar · May 12, 2024 07:53
diff --git a/ParseScreenshotViaGoogle.php b/ParseScreenshotViaGoogle.php
 <?php

 namespace App\Jobs\OCR;

 use App\Models\Media;
 use App\Models\Message;
 use App\Services\Helpers;
 use Laravel\Nova\Makeable;
 use Illuminate\Support\Str;
 use App\Models\Conversation;
 use Illuminate\Bus\Batchable;
 use Illuminate\Bus\Queueable;
 use App\Casts\Message\ExtraData;
 use Google\Cloud\Vision\V1\Page;
 use Illuminate\Support\Collection;
 use Illuminate\Support\Facades\Cache;
 use App\Types\Fluent\GoogleOcrMessage;
 use Illuminate\Queue\SerializesModels;
 use Illuminate\Queue\InteractsWithQueue;
 use App\Types\Message\Type as MessageType;
 use Illuminate\Contracts\Queue\ShouldQueue;
 use Illuminate\Foundation\Bus\Dispatchable;
 use App\Types\Message\Source as MessageSource;
 use Illuminate\Contracts\Queue\ShouldBeUnique;
 use Google\Cloud\Vision\V1\ImageAnnotatorClient;
 use Google\Cloud\Vision\V1\AnnotateImageResponse;

 /**
  * Queued job that extracts chat messages from a screenshot using Google Cloud Vision OCR.
  *
  * The job fetches a document-text annotation for the media, turns every paragraph
  * of the first OCR page into a message candidate, merges paragraphs that belong to
  * the same multi-line message, enriches and filters them, and finally stores the
  * result as OCR-sourced messages on the conversation.
  *
  * @deprecated Use `ParseMediaViaAI` instead.
  */
 class ParseScreenshotViaGoogle implements ShouldBeUnique, ShouldQueue
 {
     use Batchable,
         Dispatchable,
         InteractsWithQueue,
         Makeable,
         Queueable,
         SerializesModels;

     /** Number of times the queue may attempt this job. */
     public $tries = 3;

     /** Vision client; assigned in handle() or by the static helpers before any OCR call. */
     public ImageAnnotatorClient $client;

     /** Latest OCR response; only initialized by mediaToAnnotation() / imageToAnnotation(). */
     public AnnotateImageResponse $annotation;

     /**
      * Uniqueness key for the queue: the primary key of the media being parsed.
      */
     public function uniqueId()
     {
         return $this->media->getKey();
     }

     /**
      * @param  Conversation  $convo  Conversation that receives the parsed messages.
      * @param  Media  $media  Screenshot to run OCR on.
      */
     public function __construct(public Conversation $convo, public Media $media)
     {
     }

     /**
      * Run OCR on the media and replace its previously stored OCR messages.
      *
      * @see https://cloud.google.com/vision/docs/ocr#detect_text_in_a_local_image
      */
     public function handle(ImageAnnotatorClient $client): void
     {
         if ($this->batch()?->cancelled()) {
             return;
         }

         $this->client = $client;

         // mediaToPage() reads $this->annotation, a typed property that stays
         // uninitialized until an OCR call has run, so fetch it first.
         if (!isset($this->annotation)) {
             $this->mediaToAnnotation();
         }

         $page = $this->mediaToPage();

         // Keep the raw OCR page for a week.
         Cache::put(
             key: 'vision:' . $this->media->getKey(),
             value: $page->serializeToJsonString(),
             ttl: 3600 * 24 * 7,
         );

         // Save image height and width
         $this->media->update([
             'custom_properties->width' => $page->getWidth(),
             'custom_properties->height' => $page->getHeight(),
         ]);

         $toSave = $this->pageToMessageClass($page);

         $this->deletePreviousMessages();
         $this->convo->messages()->saveMany($toSave);
     }

     /**
      * Permanently delete every OCR-sourced message of this media, soft-deleted ones included.
      */
     public function deletePreviousMessages(): void
     {
         $this->media
             ->messages()
             ->where('source', MessageSource::OCR)
             ->withTrashed()
             ->get()
             ->each->forceDelete();
     }

     /**
      * Run document text detection against a temporary URL of the media.
      *
      * The client is closed after the call, whether or not it succeeded.
      */
     public function mediaToAnnotation(): self
     {
         // Food for thought: the raw image could also be redacted before OCR,
         // or a lighter conversion (e.g. 'eco') could be sent instead.
         $url = $this->media->getTemporaryUrl(expiration: now()->addMinutes(5));

         try {
             $this->annotation = $this->client->documentTextDetection($url);
         } finally {
             $this->client->close();
         }

         return $this;
     }

     /**
      * Run document text detection on raw image bytes.
      *
      * The client is closed after the call, whether or not it succeeded.
      *
      * @param  string  $imageContent  Image payload handed to the Vision client.
      * @param  int|false  $cacheTTL  TTL forwarded to cache()->put(), or false to skip caching.
      *                               The cache is only consulted in the local environment.
      */
     public function imageToAnnotation($imageContent, $cacheTTL = false): self
     {
         $useCache = $cacheTTL !== false && app()->isLocal();
         $cacheKey = 'google:ocr:' . hash('murmur3f', $imageContent);

         if ($useCache && cache()->has($cacheKey)) {
             $this->annotation = cache()->get($cacheKey);

             return $this;
         }

         try {
             $this->annotation = $this->client->documentTextDetection($imageContent);
         } finally {
             $this->client->close();
         }

         if ($useCache) {
             cache()->put($cacheKey, $this->annotation, $cacheTTL);
         }

         return $this;
     }

     /**
      * Return the first page of the current OCR annotation.
      *
      * @throws \Error When the response has no full-text annotation or no pages.
      */
     public function mediaToPage(): Page
     {
         $pages = $this->annotation->getFullTextAnnotation()?->getPages();

         // Check the page list as well: indexing an empty list would otherwise
         // fail with an unrelated out-of-range error.
         if ($pages === null || count($pages) === 0) {
             throw new \Error('OCR returned no pages for this image.');
         }

         return $pages[0];
     }

     /**
      * Convert an OCR page into one message candidate per paragraph, ordered top to bottom.
      *
      * @return Collection<int, GoogleOcrMessage>
      */
     public static function toMessages(Page $page): Collection
     {
         $json = json_decode($page->serializeToJsonString());

         // Fields may be absent from the serialized JSON, hence the defaults.
         $pageWidth = (int) ($json->width ?? 0);
         $pageLang = (string) data_get($json, 'property.detectedLanguages.0.languageCode');

         // @phpstan-ignore-next-line
         return collect($json->blocks ?? [])
             ->flatMap(fn (object $block) => $block->paragraphs ?? [])
             ->map(
                 fn (object $paragraph) => static::processParagraph(
                     $paragraph,
                     $pageWidth,
                     $pageLang,
                 ),
             )
             // Order messages by their y position.
             ->sortBy('bounds.top')
             ->values();
     }

     /**
      * Build a message candidate from one JSON-decoded OCR paragraph.
      *
      * @param  object  $paragraph  JSON-decoded \Google\Cloud\Vision\V1\Paragraph.
      * @param  int  $pageWidth  Page width, used to decide which side the text hugs.
      * @param  string  $pageLang  Fallback language when no word carries a detected language.
      */
     public static function processParagraph(
         object $paragraph,
         int $pageWidth,
         string $pageLang,
     ): GoogleOcrMessage {
         $bounds = $paragraph->boundingBox->vertices;

         $allLangs = data_get(
             $paragraph,
             'words.*.property.detectedLanguages.0.languageCode',
         );
         $lang = collect($allLangs)->unique()->filter()->first();

         // Glue the symbols of every word together, then join the words with a
         // space, except for pure punctuation, which attaches to the previous word.
         $sentence = collect($paragraph->words ?? [])
             ->map(
                 fn (object $word): string => collect($word->symbols ?? [])
                     ->pluck('text')
                     ->implode(''),
             )
             ->reduce(
                 fn ($acc, string $word) => ctype_punct($word)
                     ? $acc->append($word)
                     : $acc->append(" {$word}"),
                 Str::of(''),
             );
         $text = (string) $sentence->trim();

         //OBS: Google doesn't return x=0 or y=0 when text is cut on the side of the image.
         // Extract the coordinates (edges) of the message.
         [$left, $top] = [$bounds[0]->x ?? 0, $bounds[0]->y ?? 0]; // Top left corner.
         [$right, $bottom] = [$bounds[2]->x ?? 0, $bounds[2]->y ?? 0]; // Bottom right corner.

         // Distance from the end of the message til right side of the image.
         $toRight = $pageWidth - $right;
         // Distance from the start of the message til left side of the image.
         $toLeft = $left;

         return GoogleOcrMessage::make([
             // Text closer to the right edge is assumed to be a message from me,
             // text closer to the left edge a message from the other person.
             'is_me' => $toRight < $toLeft,
             'body' => $text,
             'confidence' => round($paragraph->confidence ?? 0, 2),
             'width' => abs($left - $right),
             'height' => abs($top - $bottom),
             'bounds' => (object) [
                 'left' => $left,
                 'top' => $top,
                 'right' => $right,
                 'bottom' => $bottom,
             ],
             'language' => $lang ?: $pageLang,
         ]);
     }

     /**
      * Merge consecutive paragraphs that form one message spanning several lines.
      *
      * The given message objects are modified in place.
      *
      * @param  Collection<int, GoogleOcrMessage>  $messages  Messages ordered top to bottom.
      * @return Collection<int, GoogleOcrMessage>
      */
     public static function mergeMultilineMessages(Collection $messages): Collection
     {
         // Neighbours are looked up by position below, so keys must be 0..n-1.
         $messages = $messages->values();

         // The median paragraph height is used as the line height when deciding
         // whether two paragraphs are stacked closely enough to be one message.
         $lineHeight = $messages
             ->map(fn (object $m) => abs($m->bounds->top - $m->bounds->bottom))
             ->median();

         return $messages
             // Flag every message that continues the one right before it.
             ->map(function (object $current, int $index) use ($messages, $lineHeight) {
                 // The first message has nothing to be merged into.
                 if ($index === 0) {
                     $current->distance_to_prev = 0;
                     $current->top_to_top = 0;
                     $current->is_the_same_message = false;

                     return $current;
                 }

                 $prev = $messages[$index - 1];

                 // Gap between the previous message's bottom and this message's top.
                 $distance = abs($prev->bounds->bottom - $current->bounds->top);
                 $current->distance_to_prev = $distance;

                 // Top-to-top distance, to tell "next line" apart from "same
                 // line, different position on the x axis".
                 $topToTop = abs($prev->bounds->top - $current->bounds->top);
                 $current->top_to_top = $topToTop;

                 // Same message when the tops are more than a line apart (so it
                 // is on a following line) while the gap between the two boxes
                 // is smaller than a line (so it sits directly below).
                 $current->is_the_same_message =
                     $topToTop > $lineHeight && $distance < $lineHeight;

                 return $current;
             })
             // Fold flagged messages into the message they continue.
             ->reduce(function (Collection $acc, object $current) {
                 if ($current->is_the_same_message === false) {
                     return $acc->push($current);
                 }

                 // Objects are handles, so this updates the item stored in $acc.
                 $last = $acc->last();
                 $last->body = trim("{$last->body} {$current->body}");

                 $last->bounds->bottom = $current->bounds->bottom;
                 $last->bounds->left = min($last->bounds->left, $current->bounds->left);
                 $last->bounds->right = max($last->bounds->right, $current->bounds->right);
                 $last->width = abs($last->bounds->left - $last->bounds->right);
                 $last->height = abs($last->bounds->top - $last->bounds->bottom);

                 // Average the two confidences (the product halved would
                 // collapse towards zero with every merged line).
                 $last->confidence = round(($last->confidence + $current->confidence) / 2, 2);

                 return $acc;
             }, collect())
             ->map(function ($m) {
                 // Drop the helper keys that were only needed for merging.
                 unset($m->distance_to_prev);
                 unset($m->top_to_top);
                 unset($m->is_the_same_message);

                 return $m;
             });
     }

     /**
      * Add the relevance, interjection and time flags, and normalise language, type and is_me.
      *
      * @param  Collection<int, GoogleOcrMessage>  $messages
      * @return Collection<int, GoogleOcrMessage>
      */
     public static function enrichMessages(Collection $messages): Collection
     {
         return $messages
             ->map(function (GoogleOcrMessage $m) {
                 $m->is_irrelevant = Helpers::isMessageIrrelevant($m->body);
                 $m->is_interjection = Helpers::isMessageInterjection($m->body);
                 $m->is_time = Helpers::isMessageTime($m->body);

                 // Interjections and timestamps are always labelled as EN.
                 $isEnglish = strtolower($m->language) === 'en';
                 if (!$isEnglish && ($m->is_interjection || $m->is_time)) {
                     $m->language = 'EN';
                 }

                 // Timestamps get their own type and are never attributed to me.
                 $m->type = $m->is_time ? MessageType::TIMESTAMP : MessageType::TEXT;
                 $m->is_me = $m->is_time ? false : $m->is_me;

                 return $m;
             })
             ->values();
     }

     /**
      * Turn an OCR page into unsaved Message models for this job's media.
      *
      * @return Collection<int, Message>
      */
     public function pageToMessageClass(Page $page): Collection
     {
         $messages = static::toMessages($page);
         $merged = static::mergeMultilineMessages($messages);
         $enriched = static::enrichMessages($merged);

         // Remove irrelevant messages and re-index, so order_column has no
         // gaps and the result stays a plain list.
         $filtered = $enriched->where('is_irrelevant', false)->values();

         $pWidth = $page->getWidth();
         $pHeight = $page->getHeight();

         return $filtered->map(
             fn (GoogleOcrMessage $obj, int $index) => new Message([
                 'body' => $obj->body,
                 'is_me' => $obj->is_me,
                 'language' => $obj->language,
                 'extra_data' => new ExtraData(
                     box_left: $obj->bounds->left,
                     box_top: $obj->bounds->top,
                     box_right: $obj->bounds->right,
                     box_bottom: $obj->bounds->bottom,

                     is_irrelevant: $obj->is_irrelevant,
                     confidence: $obj->confidence,

                     photo_width: $pWidth,
                     photo_height: $pHeight,
                 ),
                 'media_id' => $this->media->getKey(),
                 'order_column' => $index + 1,
                 'source' => MessageSource::OCR,
                 'type' => $obj->type,
             ]),
         );
     }

     /**
      * OCR raw image bytes and return the parsed, unsaved messages.
      *
      * @return Collection<int, Message>
      */
     public static function parseImageIntoMessages(string $binaryData): Collection
     {
         $self = self::annotateImage($binaryData);

         return $self->pageToMessageClass($self->mediaToPage());
     }

     /**
      * OCR raw image bytes and return the full detected text, or '' when nothing was detected.
      */
     public static function parseImageIntoBlockText(string $binaryData): string
     {
         $self = self::annotateImage($binaryData);

         return $self->annotation->getFullTextAnnotation()?->getText() ?? '';
     }

     /**
      * Build an instance around empty placeholder models and OCR the given image bytes.
      */
     private static function annotateImage(string $binaryData): self
     {
         $self = new self(media: new Media(), convo: new Conversation());
         $self->client = app(ImageAnnotatorClient::class);

         // 600 is the cache TTL forwarded to imageToAnnotation().
         return $self->imageToAnnotation($binaryData, 600);
     }
 }
	<?php

	namespace App\Jobs\OCR;

	use App\Models\Media;
	use App\Models\Message;
	use App\Services\Helpers;
	use Laravel\Nova\Makeable;
	use Illuminate\Support\Str;
	use App\Models\Conversation;
	use Illuminate\Bus\Batchable;
	use Illuminate\Bus\Queueable;
	use App\Casts\Message\ExtraData;
	use Google\Cloud\Vision\V1\Page;
	use Illuminate\Support\Collection;
	use Illuminate\Support\Facades\Cache;
	use App\Types\Fluent\GoogleOcrMessage;
	use Illuminate\Queue\SerializesModels;
	use Illuminate\Queue\InteractsWithQueue;
	use App\Types\Message\Type as MessageType;
	use Illuminate\Contracts\Queue\ShouldQueue;
	use Illuminate\Foundation\Bus\Dispatchable;
	use App\Types\Message\Source as MessageSource;
	use Illuminate\Contracts\Queue\ShouldBeUnique;
	use Google\Cloud\Vision\V1\ImageAnnotatorClient;
	use Google\Cloud\Vision\V1\AnnotateImageResponse;

	/**
	 * Queued job that extracts chat messages from a screenshot using Google Cloud Vision OCR.
	 *
	 * The job fetches a document-text annotation for the media, turns every paragraph
	 * of the first OCR page into a message candidate, merges paragraphs that belong to
	 * the same multi-line message, enriches and filters them, and finally stores the
	 * result as OCR-sourced messages on the conversation.
	 *
	 * @deprecated Use `ParseMediaViaAI` instead.
	 */
	class ParseScreenshotViaGoogle implements ShouldBeUnique, ShouldQueue
	{
	    use Batchable,
	        Dispatchable,
	        InteractsWithQueue,
	        Makeable,
	        Queueable,
	        SerializesModels;

	    /** Number of times the queue may attempt this job. */
	    public $tries = 3;

	    /** Vision client; assigned in handle() or by the static helpers before any OCR call. */
	    public ImageAnnotatorClient $client;

	    /** Latest OCR response; only initialized by mediaToAnnotation() / imageToAnnotation(). */
	    public AnnotateImageResponse $annotation;

	    /**
	     * Uniqueness key for the queue: the primary key of the media being parsed.
	     */
	    public function uniqueId()
	    {
	        return $this->media->getKey();
	    }

	    /**
	     * @param  Conversation  $convo  Conversation that receives the parsed messages.
	     * @param  Media  $media  Screenshot to run OCR on.
	     */
	    public function __construct(public Conversation $convo, public Media $media)
	    {
	    }

	    /**
	     * Run OCR on the media and replace its previously stored OCR messages.
	     *
	     * @see https://cloud.google.com/vision/docs/ocr#detect_text_in_a_local_image
	     */
	    public function handle(ImageAnnotatorClient $client): void
	    {
	        if ($this->batch()?->cancelled()) {
	            return;
	        }

	        $this->client = $client;

	        // mediaToPage() reads $this->annotation, a typed property that stays
	        // uninitialized until an OCR call has run, so fetch it first.
	        if (!isset($this->annotation)) {
	            $this->mediaToAnnotation();
	        }

	        $page = $this->mediaToPage();

	        // Keep the raw OCR page for a week.
	        Cache::put(
	            key: 'vision:' . $this->media->getKey(),
	            value: $page->serializeToJsonString(),
	            ttl: 3600 * 24 * 7,
	        );

	        // Save image height and width
	        $this->media->update([
	            'custom_properties->width' => $page->getWidth(),
	            'custom_properties->height' => $page->getHeight(),
	        ]);

	        $toSave = $this->pageToMessageClass($page);

	        $this->deletePreviousMessages();
	        $this->convo->messages()->saveMany($toSave);
	    }

	    /**
	     * Permanently delete every OCR-sourced message of this media, soft-deleted ones included.
	     */
	    public function deletePreviousMessages(): void
	    {
	        $this->media
	            ->messages()
	            ->where('source', MessageSource::OCR)
	            ->withTrashed()
	            ->get()
	            ->each->forceDelete();
	    }

	    /**
	     * Run document text detection against a temporary URL of the media.
	     *
	     * The client is closed after the call, whether or not it succeeded.
	     */
	    public function mediaToAnnotation(): self
	    {
	        // Food for thought: the raw image could also be redacted before OCR,
	        // or a lighter conversion (e.g. 'eco') could be sent instead.
	        $url = $this->media->getTemporaryUrl(expiration: now()->addMinutes(5));

	        try {
	            $this->annotation = $this->client->documentTextDetection($url);
	        } finally {
	            $this->client->close();
	        }

	        return $this;
	    }

	    /**
	     * Run document text detection on raw image bytes.
	     *
	     * The client is closed after the call, whether or not it succeeded.
	     *
	     * @param  string  $imageContent  Image payload handed to the Vision client.
	     * @param  int|false  $cacheTTL  TTL forwarded to cache()->put(), or false to skip caching.
	     *                               The cache is only consulted in the local environment.
	     */
	    public function imageToAnnotation($imageContent, $cacheTTL = false): self
	    {
	        $useCache = $cacheTTL !== false && app()->isLocal();
	        $cacheKey = 'google:ocr:' . hash('murmur3f', $imageContent);

	        if ($useCache && cache()->has($cacheKey)) {
	            $this->annotation = cache()->get($cacheKey);

	            return $this;
	        }

	        try {
	            $this->annotation = $this->client->documentTextDetection($imageContent);
	        } finally {
	            $this->client->close();
	        }

	        if ($useCache) {
	            cache()->put($cacheKey, $this->annotation, $cacheTTL);
	        }

	        return $this;
	    }

	    /**
	     * Return the first page of the current OCR annotation.
	     *
	     * @throws \Error When the response has no full-text annotation or no pages.
	     */
	    public function mediaToPage(): Page
	    {
	        $pages = $this->annotation->getFullTextAnnotation()?->getPages();

	        // Check the page list as well: indexing an empty list would otherwise
	        // fail with an unrelated out-of-range error.
	        if ($pages === null || count($pages) === 0) {
	            throw new \Error('OCR returned no pages for this image.');
	        }

	        return $pages[0];
	    }

	    /**
	     * Convert an OCR page into one message candidate per paragraph, ordered top to bottom.
	     *
	     * @return Collection<int, GoogleOcrMessage>
	     */
	    public static function toMessages(Page $page): Collection
	    {
	        $json = json_decode($page->serializeToJsonString());

	        // Fields may be absent from the serialized JSON, hence the defaults.
	        $pageWidth = (int) ($json->width ?? 0);
	        $pageLang = (string) data_get($json, 'property.detectedLanguages.0.languageCode');

	        // @phpstan-ignore-next-line
	        return collect($json->blocks ?? [])
	            ->flatMap(fn (object $block) => $block->paragraphs ?? [])
	            ->map(
	                fn (object $paragraph) => static::processParagraph(
	                    $paragraph,
	                    $pageWidth,
	                    $pageLang,
	                ),
	            )
	            // Order messages by their y position.
	            ->sortBy('bounds.top')
	            ->values();
	    }

	    /**
	     * Build a message candidate from one JSON-decoded OCR paragraph.
	     *
	     * @param  object  $paragraph  JSON-decoded \Google\Cloud\Vision\V1\Paragraph.
	     * @param  int  $pageWidth  Page width, used to decide which side the text hugs.
	     * @param  string  $pageLang  Fallback language when no word carries a detected language.
	     */
	    public static function processParagraph(
	        object $paragraph,
	        int $pageWidth,
	        string $pageLang,
	    ): GoogleOcrMessage {
	        $bounds = $paragraph->boundingBox->vertices;

	        $allLangs = data_get(
	            $paragraph,
	            'words.*.property.detectedLanguages.0.languageCode',
	        );
	        $lang = collect($allLangs)->unique()->filter()->first();

	        // Glue the symbols of every word together, then join the words with a
	        // space, except for pure punctuation, which attaches to the previous word.
	        $sentence = collect($paragraph->words ?? [])
	            ->map(
	                fn (object $word): string => collect($word->symbols ?? [])
	                    ->pluck('text')
	                    ->implode(''),
	            )
	            ->reduce(
	                fn ($acc, string $word) => ctype_punct($word)
	                    ? $acc->append($word)
	                    : $acc->append(" {$word}"),
	                Str::of(''),
	            );
	        $text = (string) $sentence->trim();

	        //OBS: Google doesn't return x=0 or y=0 when text is cut on the side of the image.
	        // Extract the coordinates (edges) of the message.
	        [$left, $top] = [$bounds[0]->x ?? 0, $bounds[0]->y ?? 0]; // Top left corner.
	        [$right, $bottom] = [$bounds[2]->x ?? 0, $bounds[2]->y ?? 0]; // Bottom right corner.

	        // Distance from the end of the message til right side of the image.
	        $toRight = $pageWidth - $right;
	        // Distance from the start of the message til left side of the image.
	        $toLeft = $left;

	        return GoogleOcrMessage::make([
	            // Text closer to the right edge is assumed to be a message from me,
	            // text closer to the left edge a message from the other person.
	            'is_me' => $toRight < $toLeft,
	            'body' => $text,
	            'confidence' => round($paragraph->confidence ?? 0, 2),
	            'width' => abs($left - $right),
	            'height' => abs($top - $bottom),
	            'bounds' => (object) [
	                'left' => $left,
	                'top' => $top,
	                'right' => $right,
	                'bottom' => $bottom,
	            ],
	            'language' => $lang ?: $pageLang,
	        ]);
	    }

	    /**
	     * Merge consecutive paragraphs that form one message spanning several lines.
	     *
	     * The given message objects are modified in place.
	     *
	     * @param  Collection<int, GoogleOcrMessage>  $messages  Messages ordered top to bottom.
	     * @return Collection<int, GoogleOcrMessage>
	     */
	    public static function mergeMultilineMessages(Collection $messages): Collection
	    {
	        // Neighbours are looked up by position below, so keys must be 0..n-1.
	        $messages = $messages->values();

	        // The median paragraph height is used as the line height when deciding
	        // whether two paragraphs are stacked closely enough to be one message.
	        $lineHeight = $messages
	            ->map(fn (object $m) => abs($m->bounds->top - $m->bounds->bottom))
	            ->median();

	        return $messages
	            // Flag every message that continues the one right before it.
	            ->map(function (object $current, int $index) use ($messages, $lineHeight) {
	                // The first message has nothing to be merged into.
	                if ($index === 0) {
	                    $current->distance_to_prev = 0;
	                    $current->top_to_top = 0;
	                    $current->is_the_same_message = false;

	                    return $current;
	                }

	                $prev = $messages[$index - 1];

	                // Gap between the previous message's bottom and this message's top.
	                $distance = abs($prev->bounds->bottom - $current->bounds->top);
	                $current->distance_to_prev = $distance;

	                // Top-to-top distance, to tell "next line" apart from "same
	                // line, different position on the x axis".
	                $topToTop = abs($prev->bounds->top - $current->bounds->top);
	                $current->top_to_top = $topToTop;

	                // Same message when the tops are more than a line apart (so it
	                // is on a following line) while the gap between the two boxes
	                // is smaller than a line (so it sits directly below).
	                $current->is_the_same_message =
	                    $topToTop > $lineHeight && $distance < $lineHeight;

	                return $current;
	            })
	            // Fold flagged messages into the message they continue.
	            ->reduce(function (Collection $acc, object $current) {
	                if ($current->is_the_same_message === false) {
	                    return $acc->push($current);
	                }

	                // Objects are handles, so this updates the item stored in $acc.
	                $last = $acc->last();
	                $last->body = trim("{$last->body} {$current->body}");

	                $last->bounds->bottom = $current->bounds->bottom;
	                $last->bounds->left = min($last->bounds->left, $current->bounds->left);
	                $last->bounds->right = max($last->bounds->right, $current->bounds->right);
	                $last->width = abs($last->bounds->left - $last->bounds->right);
	                $last->height = abs($last->bounds->top - $last->bounds->bottom);

	                // Average the two confidences (the product halved would
	                // collapse towards zero with every merged line).
	                $last->confidence = round(($last->confidence + $current->confidence) / 2, 2);

	                return $acc;
	            }, collect())
	            ->map(function ($m) {
	                // Drop the helper keys that were only needed for merging.
	                unset($m->distance_to_prev);
	                unset($m->top_to_top);
	                unset($m->is_the_same_message);

	                return $m;
	            });
	    }

	    /**
	     * Add the relevance, interjection and time flags, and normalise language, type and is_me.
	     *
	     * @param  Collection<int, GoogleOcrMessage>  $messages
	     * @return Collection<int, GoogleOcrMessage>
	     */
	    public static function enrichMessages(Collection $messages): Collection
	    {
	        return $messages
	            ->map(function (GoogleOcrMessage $m) {
	                $m->is_irrelevant = Helpers::isMessageIrrelevant($m->body);
	                $m->is_interjection = Helpers::isMessageInterjection($m->body);
	                $m->is_time = Helpers::isMessageTime($m->body);

	                // Interjections and timestamps are always labelled as EN.
	                $isEnglish = strtolower($m->language) === 'en';
	                if (!$isEnglish && ($m->is_interjection || $m->is_time)) {
	                    $m->language = 'EN';
	                }

	                // Timestamps get their own type and are never attributed to me.
	                $m->type = $m->is_time ? MessageType::TIMESTAMP : MessageType::TEXT;
	                $m->is_me = $m->is_time ? false : $m->is_me;

	                return $m;
	            })
	            ->values();
	    }

	    /**
	     * Turn an OCR page into unsaved Message models for this job's media.
	     *
	     * @return Collection<int, Message>
	     */
	    public function pageToMessageClass(Page $page): Collection
	    {
	        $messages = static::toMessages($page);
	        $merged = static::mergeMultilineMessages($messages);
	        $enriched = static::enrichMessages($merged);

	        // Remove irrelevant messages and re-index, so order_column has no
	        // gaps and the result stays a plain list.
	        $filtered = $enriched->where('is_irrelevant', false)->values();

	        $pWidth = $page->getWidth();
	        $pHeight = $page->getHeight();

	        return $filtered->map(
	            fn (GoogleOcrMessage $obj, int $index) => new Message([
	                'body' => $obj->body,
	                'is_me' => $obj->is_me,
	                'language' => $obj->language,
	                'extra_data' => new ExtraData(
	                    box_left: $obj->bounds->left,
	                    box_top: $obj->bounds->top,
	                    box_right: $obj->bounds->right,
	                    box_bottom: $obj->bounds->bottom,

	                    is_irrelevant: $obj->is_irrelevant,
	                    confidence: $obj->confidence,

	                    photo_width: $pWidth,
	                    photo_height: $pHeight,
	                ),
	                'media_id' => $this->media->getKey(),
	                'order_column' => $index + 1,
	                'source' => MessageSource::OCR,
	                'type' => $obj->type,
	            ]),
	        );
	    }

	    /**
	     * OCR raw image bytes and return the parsed, unsaved messages.
	     *
	     * @return Collection<int, Message>
	     */
	    public static function parseImageIntoMessages(string $binaryData): Collection
	    {
	        $self = self::annotateImage($binaryData);

	        return $self->pageToMessageClass($self->mediaToPage());
	    }

	    /**
	     * OCR raw image bytes and return the full detected text, or '' when nothing was detected.
	     */
	    public static function parseImageIntoBlockText(string $binaryData): string
	    {
	        $self = self::annotateImage($binaryData);

	        return $self->annotation->getFullTextAnnotation()?->getText() ?? '';
	    }

	    /**
	     * Build an instance around empty placeholder models and OCR the given image bytes.
	     */
	    private static function annotateImage(string $binaryData): self
	    {
	        $self = new self(media: new Media(), convo: new Conversation());
	        $self->client = app(ImageAnnotatorClient::class);

	        // 600 is the cache TTL forwarded to imageToAnnotation().
	        return $self->imageToAnnotation($binaryData, 600);
	    }
	}