Created
October 7, 2022 15:17
-
-
Save denisxab/9c3d1cfbcbabed5fb40a9b2d0cd971a6 to your computer and use it in GitHub Desktop.
Конвертация кодировки у файлов
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import codecs
import os
from pathlib import Path

import chardet
def get_encoding(path:str)->str:
    """
    Detect the character encoding of a file so it can be read correctly.

    The whole file is read as bytes and handed to ``chardet.detect``; the
    value stored under the ``'encoding'`` key of the result is returned
    (``None`` if that key is absent).
    """
    raw_bytes = Path(path).read_bytes()
    detection = chardet.detect(raw_bytes)
    return detection.get('encoding')
def convert_charset_file(in_path, out_path=None, in_encode=None, out_encode='utf-8'):
    """
    Re-encode a single text file from ``in_encode`` to ``out_encode``.

    Args:
        in_path: Path of the file to read.
        out_path: Destination path. When omitted, the result is written to
            an ``out`` directory next to the source file, under the same
            file name.
        in_encode: Encoding of the source file. When omitted, it is
            detected with ``get_encoding``.
        out_encode: Encoding used for the written file.

    Nothing is written when ``in_encode`` already equals ``out_encode``.
    After a conversion a confirmation line is printed.
    """
    in_file = Path(in_path)
    if not out_path:
        out_path = in_file.resolve().parent / "out" / in_file.name
    if not in_encode:
        in_encode = get_encoding(in_file)
    if in_encode == out_encode:
        return

    # The destination directory (notably the default "out") may not exist
    # yet; opening the target for writing would fail without it.
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)

    block_size = 1048576  # size hint passed to each read() call
    with codecs.open(in_file, "r", in_encode) as source_file, \
            codecs.open(out_path, "w", out_encode) as target_file:
        while True:
            chunk = source_file.read(block_size)
            if not chunk:
                break
            target_file.write(chunk)
    print(f"Перезаписан {in_path} из {in_encode} > {out_encode}")
def convert_charset_dir(in_path, in_encode='windows-1251', out_encode='utf-8'):
    """
    Convert every regular file located directly inside ``in_path``.

    Entries that are not regular files (such as sub-directories) are
    skipped; there is no recursion. Each file is handed to
    ``convert_charset_file`` without an explicit output path, so that
    function's default destination applies.

    Args:
        in_path: Directory whose files are converted.
        in_encode: Encoding assumed for every source file.
        out_encode: Encoding to convert the files to.
    """
    for entry in Path(in_path).iterdir():
        if entry.is_file():
            convert_charset_file(entry, in_encode=in_encode, out_encode=out_encode)
# Script entry point: convert every file found directly inside the "in"
# directory (resolved relative to the current working directory).
convert_charset_dir('in')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment