Created
June 23, 2021 07:12
-
-
Save troelskn/0ed926153097da9591a8b741a0377644 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Detects and repairs "mojibake": text whose bytes were decoded with the
# wrong character encoding. Two mix-ups are handled, UTF-8 vs ISO-8859-1
# (Latin-1) in either direction. All repaired output is UTF-8.
module Mojibake
  # UTF-8 content that was interpreted as Latin-1 (and then transcoded to
  # UTF-8), so every multi-byte character shows up as two or more Latin-1
  # characters, e.g. "æ" appears as "Ã¦".
  class Utf8
    # The garbled forms of the Danish letters æøåÆØÅ; their presence is the
    # heuristic used to recognise this kind of mojibake.
    CANARY = "æøåÆØÅ".chars.map do |c|
      c.encode(Encoding::UTF_8).force_encoding(Encoding::ISO_8859_1).encode(Encoding::UTF_8)
    end.freeze

    # @param content [String] text expected to be tagged as UTF-8
    # @return [Boolean] true when +content+ is well-formed and contains at
    #   least one canary sequence
    def detect?(content)
      # Broken strings cannot be transcoded by #repair; they are left to the
      # Latin1 sniffer instead.
      content.valid_encoding? && CANARY.any? { |c| content.include?(c) }
    end

    # Reverses the mis-decoding by mapping each character back to its Latin-1
    # byte and reading the resulting bytes as UTF-8.
    #
    # @param content [String] garbled text
    # @return [String] the repaired UTF-8 string, or +content+ unchanged when
    #   the round trip is impossible (a character has no Latin-1 byte) or
    #   would yield malformed UTF-8 (the text is only partly garbled)
    def repair(content)
      repaired = content.encode(Encoding::ISO_8859_1).force_encoding(Encoding::UTF_8)
      repaired.valid_encoding? ? repaired : content
    rescue EncodingError
      content
    end
  end

  # Latin-1 content that was tagged as UTF-8 without transcoding, which leaves
  # a string that is not well-formed UTF-8.
  class Latin1
    # Latin-1 bytes of æøåÆØÅ tagged as UTF-8. Retained for backward
    # compatibility only: these strings are themselves invalid UTF-8, and a
    # substring search with an invalid needle is not reliable (MRI reports no
    # match), so #detect? checks well-formedness instead.
    CANARY = "æøåÆØÅ".chars.map do |c|
      c.encode(Encoding::ISO_8859_1).force_encoding(Encoding::UTF_8)
    end.freeze

    # @param content [String] text tagged as UTF-8
    # @return [Boolean] true when +content+ is tagged UTF-8 but its bytes are
    #   not valid UTF-8
    def detect?(content)
      content.encoding == Encoding::UTF_8 && !content.valid_encoding?
    end

    # Reinterprets the bytes as Latin-1 and transcodes them to UTF-8.
    # The receiver is not modified.
    #
    # @param content [String] mis-tagged text
    # @return [String] a new UTF-8 string
    def repair(content)
      content.dup.force_encoding(Encoding::ISO_8859_1).encode(Encoding::UTF_8)
    end
  end

  class << self
    # @return [Array<#detect?, #repair>] fresh sniffer instances, in the
    #   order they are tried
    def sniffers
      [Utf8, Latin1].map(&:new)
    end

    # @param content [String]
    # @return [Boolean] true when +content+ is binary (ASCII-8BIT) or any
    #   sniffer recognises mojibake in it
    def detect?(content)
      return true if content.encoding == Encoding::ASCII_8BIT

      # Normalise to UTF-8 first, exactly as .repair does; comparing a
      # non-UTF-8 string against the UTF-8 canaries would raise
      # Encoding::CompatibilityError.
      normalized = auto_encode(content)
      sniffers.any? { |sniffer| sniffer.detect?(normalized) }
    end

    # Normalises +content+ to UTF-8 and applies the first sniffer that
    # recognises it.
    #
    # @param content [String]
    # @return [String] the repaired text, or the UTF-8 normalised text when no
    #   sniffer matches
    def repair(content)
      normalized = auto_encode(content)
      matching = sniffers.find { |sniffer| sniffer.detect?(normalized) }
      matching ? matching.repair(normalized) : normalized
    end

    # Transcodes +mixed+ to UTF-8. Binary (ASCII-8BIT) input carries no
    # encoding information, so it is read as UTF-8 when its bytes are valid
    # UTF-8 and as Latin-1 otherwise.
    #
    # @param mixed [String]
    # @return [String] a UTF-8 tagged copy
    # @raise [RuntimeError] if no candidate encoding fits binary input
    def auto_encode(mixed)
      return mixed.encode(Encoding::UTF_8) unless mixed.encoding == Encoding::ASCII_8BIT

      [Encoding::UTF_8, Encoding::ISO_8859_1].each do |encoding|
        candidate = mixed.dup.force_encoding(encoding)
        return candidate.encode(Encoding::UTF_8) if candidate.valid_encoding?
      end
      raise "Unable to determine encoding of ASCII_8BIT"
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment