Skip to content

Instantly share code, notes, and snippets.

@tinkernels
Last active January 28, 2021 10:20
Show Gist options
  • Save tinkernels/1d62207797a640abb1d8797a29f5f4e0 to your computer and use it in GitHub Desktop.
Save tinkernels/1d62207797a640abb1d8797a29f5f4e0 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
import os
import shutil
import subprocess
from pathlib import Path

from binaryornot.check import is_binary
from chardet.universaldetector import UniversalDetector
# Root directory whose tree is scanned.
base_path = "."

# Lower-cased detected encoding name -> name passed to iconv in its place.
encoding_map = {"utf-8-sig": "utf-8"}

# Accumulators filled while the tree is processed; all are printed at the end.
binary_extnames = set()
txt_extnames = set()
encodings = set()
files_manipulated = []
failed_files = []

# Suffixes that are never converted.
except_extnames = [".orig", ".iconv"]

# Suffixes that are converted; '' matches files that have no extension.
convert_extnames = [
    ".txt",
    ".toml",
    ".py",
    ".in",
    ".sh",
    ".bat",
    ".h",
    ".c",
    ".cpp",
    ".hpp",
    ".pyx",
    "",
]

# One detector instance, reset and reused for every file.
detector = UniversalDetector()
def _exit_status(argv, **run_kwargs):
    """Run *argv* without a shell and return its exit status.

    Returns 127 when the program cannot be started, the status a shell
    reports for a command it cannot find, so a missing tool is handled like
    any other failing command instead of raising.
    """
    try:
        return subprocess.run(argv, check=False, **run_kwargs).returncode
    except OSError:
        return 127


def convert_file(fpath: str) -> None:
    """Convert one file to UTF-8 in place, then normalise it with vim.

    The encoding is guessed with the module-level ``detector`` and added to
    ``encodings``. When the guess has a confidence of 0.6 or less the file is
    left untouched and recorded in ``failed_files``. Otherwise the file is
    piped through ``iconv -c`` into ``<fpath>.iconv``, that output replaces
    the original, and vim rewrites the file with ``ff=unix``, ``nobomb`` and
    tabs expanded to 4 spaces (``retab``).

    If iconv exits with a non-zero status the original is first copied to
    ``<fpath>.orig`` (an existing backup is never overwritten) and the file
    is recorded in ``failed_files``; the iconv output still replaces the
    original and the file is also recorded in ``files_manipulated``.

    External commands are run from argument lists, never through a shell,
    so characters in file names cannot be interpreted as shell syntax.

    Args:
        fpath: Path of the file to convert.

    Raises:
        OSError: If the file cannot be read, backed up or replaced.
    """
    detector.reset()
    with open(fpath, "rb") as fc:
        for line in fc:
            detector.feed(line)
            if detector.done:
                break
    detector.close()

    result = detector.result
    detected = result['encoding']
    encodings.add(detected)
    record = {
        "confidence": f"{result['confidence']:.2f}",
        "encoding": detected,
        "path": fpath,
    }

    # Guard clause: no usable guess means the file is not touched at all.
    if detected is None or result['confidence'] <= .6:
        failed_files.append(record)
        return

    # UTF-8 input is not skipped: it still goes through iconv and the vim
    # normalisation pass below.
    source_encoding = encoding_map.get(detected.lower(), detected)

    # Feed iconv through stdin/stdout so the file name never appears on a
    # command line where it could be parsed as an option or shell syntax.
    converted_path = f"{fpath}.iconv"
    with open(fpath, "rb") as src, open(converted_path, "wb") as dst:
        status = _exit_status(
            ["iconv", "-c", "-f", source_encoding, "-t", "utf-8"],
            stdin=src,
            stdout=dst,
        )

    if status != 0:
        failed_files.append(dict(record))
        # Keep a pristine copy before the original is overwritten; like
        # ``cp -n``, never clobber a backup left by an earlier run.
        backup_path = f"{fpath}.orig"
        if not os.path.exists(backup_path):
            shutil.copy(fpath, backup_path)

    os.replace(converted_path, fpath)

    # "--" ends option parsing so a name starting with '-' or '+' is still
    # treated as a file. The exit status is ignored, as before.
    _exit_status([
        "vim",
        "+set ff=unix nobomb tabstop=4 shiftwidth=4 expandtab",
        "+retab",
        "+wq",
        "--",
        fpath,
    ])
    files_manipulated.append(record)
# Walk the tree under base_path, record the suffix of every file as binary or
# text, and convert the text files whose suffix is listed in convert_extnames.
for dir_name, _subdirs, file_names in os.walk(base_path):
    for fname in file_names:
        # os.walk already yields directory paths that start with base_path;
        # joining base_path a second time would duplicate it for a relative
        # root other than ".".
        f_path = str(Path(dir_name) / fname)
        suffix_ = Path(f_path).suffix.lower()
        if is_binary(f_path):
            binary_extnames.add(suffix_)
            continue
        txt_extnames.add(suffix_)
        if suffix_ not in except_extnames and suffix_ in convert_extnames:
            convert_file(fpath=f_path)

# Report: one line per file as "<confidence> <encoding>\t<path>".
print(f"{'-'*32}\nconverted file:")
for f_ in files_manipulated:
    print(f"{f_['confidence']} {f_['encoding']}\t{f_['path']}")
print(f"{'-'*32}\nconvert failed file:")
for f_ in failed_files:
    print(f"{f_['confidence']} {f_['encoding']}\t{f_['path']}")
print(f"{'-'*32}\nbinary file ext: {binary_extnames}")
print(f"{'-'*32}\ntxt file ext: {txt_extnames}")
print(f"{'-'*32}\ndetected encodings: {encodings}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment