Created
March 21, 2018 10:10
-
-
Save pochemuto/32ea6245a10fee5956b2d9e9125d7068 to your computer and use it in GitHub Desktop.
Cleanup ypp podcast archive
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#coding: utf | |
from mutagen.mp3 import MP3 | |
from mutagen.easyid3 import EasyID3 | |
from mutagen.id3 import ID3, TIT2, TIT3, TDES, TDRL, TimeStampTextFrame, ID3TimeStamp | |
from mutagen.id3._util import ID3NoHeaderError | |
from os import path | |
from datetime import datetime | |
import os | |
import re | |
import shutil | |
from datetime import timedelta | |
class NotFound(Exception):
    """Raised when an expected file cannot be located.

    ``get_file_name`` raises it when a folder contains no ``.mp3`` file.
    """

    def __init__(self, message):
        # Zero-argument super() resolves relative to NotFound, so
        # Exception.__init__ is no longer skipped in the MRO.
        super().__init__(message)
class Notes:
    """Show notes for one podcast episode.

    Holds ``title``, ``date``, ``text`` and ``number``. ``date`` is an empty
    string until ``date_from_str`` replaces it with a ``datetime``.
    """

    # Layout: "<day> <month name> <year> <hour>:<minute>".
    date_pattern = re.compile(r'(\d+) (\w+) (\d+) (\d+):(\d+)')
    # Russian month names as written in dates; list index + 1 is the month.
    months = ['января', 'февраля', 'марта', 'апреля', 'мая', 'июня', 'июля', 'августа', 'сентября', 'октября', 'ноября', 'декабря']

    def __init__(self):
        self.title = ''
        self.date = ''
        self.text = ''
        self.number = 0

    def date_from_str(self, strdate):
        """Parse ``strdate`` and store the resulting ``datetime`` in ``self.date``.

        The month name is matched case-insensitively against ``months``.

        Raises:
            ValueError: if ``strdate`` does not start with the expected
                layout or names a month that is not in ``months``.
        """
        cls = type(self)
        m = cls.date_pattern.match(strdate)
        if m is None:
            # Without this guard the failure surfaced as an AttributeError
            # on None, which hid the offending input.
            raise ValueError('unrecognized date: ' + repr(strdate))
        day, month_str, year, hour, minute = m.groups()
        try:
            month = cls.months.index(month_str.lower()) + 1
        except ValueError:
            raise ValueError('unknown month name: ' + repr(month_str)) from None
        self.date = datetime(int(year), month, int(day), int(hour), int(minute))

    def __repr__(self):
        return '{0} [{1}]: {2}'.format(self.title, self.date, self.text)
def get_number(title, mp3_path):
    """Work out the episode number of a podcast.

    Sources are tried in this order:

    1. a number at the start of the title, optionally preceded by one of the
       known show prefixes and a ``#`` or ``№`` sign;
    2. a fixed table of titles that carry no usable number;
    3. ``ypp<digits>`` (case-insensitive) anywhere in ``mp3_path``.

    Args:
        title: episode title; surrounding whitespace is ignored.
        mp3_path: path of the mp3 file, or ``None`` when it is not known.

    Returns:
        The episode number as an ``int``. ``-1`` is returned for the one
        title whose file is known to be missing.

    Raises:
        ValueError: if no source yields a number.
    """
    title = title.strip()
    match = re.match(
        r'(Встреча|ЯПП|YPP|Выпуск|Пьянка|Подкаст|Шоу|Budam|Встерча|Янки после пьянки|ЯПП - После РТ|Будам)?\s*[#№]?([0-9]+)',
        title,
        flags=re.I,
    )
    if match:
        return int(match.group(2))
    # A separate "Пьянка #N" check used to follow here; it could never fire
    # because the pattern above already accepts every such title.
    if 'сорок четыре' in title:
        return 44
    known_titles = {
        'Представляюсь и ругаюсь': 1,
        'ЯПП и Будам - Зачем нужны мужчины': 270,
        'ЯПП и Будам - О музыке, птичках и собачках': 525,
        'Ученые записки': 384,
        'ЯПП и Будам - Что такое цивилизованная страна': 534,
        # the file is missing anyway
        'Записал подкаст Будам, то да се мы обсуждам': -1,
        'Ученые записки № 2. О семье и браке': 387,
    }
    if title in known_titles:
        return known_titles[title]
    # mp3_path is optional (read() defaults it to None); skip the path
    # lookup instead of letting re.search fail with a TypeError.
    if mp3_path is not None:
        match = re.search(r'ypp([0-9]+)', mp3_path, re.I)
        if match:
            return int(match.group(1))
    raise ValueError("couldn't get number from " + title)
class Mp3:
    """Small wrapper around the EasyID3 tags of a single file.

    ``filepath`` is remembered so that ``save`` can write the tags back to
    it, including when the file had no ID3 header to begin with.
    """

    def __init__(self, filepath):
        self.filepath = filepath
        try:
            tags = EasyID3(filepath)
        except ID3NoHeaderError:
            # No ID3 header in the file: start from an empty tag set.
            tags = EasyID3()
        self.mp3 = tags

    def get(self, tag):
        """Return the stored value for ``tag``, or ``None`` on ``KeyError``."""
        try:
            value = self.mp3[tag]
        except KeyError:
            value = None
        return value

    def save(self):
        """Write the tags to ``self.filepath`` and return what ``save`` returns."""
        # NOTE(review): per mutagen's ID3.save, v1=2 should create/update the
        # ID3v1 tag and v2_version=3 should write ID3v2.3 - confirm against
        # the installed mutagen version.
        target = self.filepath
        return self.mp3.save(filename=target, v1=2, v2_version=3)

    def set(self, tag, value):
        """Assign ``value`` to ``tag`` in the underlying tag object."""
        self.mp3[tag] = value
def read(filepath, mp3_path=None):
    """Load show notes from a cp1251-encoded text file.

    Line 1 is the title and line 2 the date (parsed by
    ``Notes.date_from_str``). Line 3 is skipped and everything from line 4
    onward becomes the text.

    Args:
        filepath: path of the notes file.
        mp3_path: path of the matching mp3, handed to ``get_number`` together
            with the title.

    Returns:
        A populated ``Notes`` instance.

    Raises:
        ValueError: if the file has fewer than two lines.
    """
    # Context manager so the handle is closed even if decoding fails.
    with open(filepath, encoding='cp1251') as source:
        data = source.readlines()
    if len(data) < 2:
        raise ValueError('notes file needs a title line and a date line: ' + filepath)
    notes = Notes()
    notes.title = data[0].strip()
    notes.date_from_str(data[1].strip())
    # NOTE(review): line index 2 is dropped - presumably a blank separator.
    notes.text = ''.join(data[3:]).strip()
    notes.number = get_number(notes.title, mp3_path)
    return notes
def get_file_name(podcast_path):
    """Return the real path of the first ``.mp3`` file in ``podcast_path``.

    Entries are examined in sorted order, so the result is deterministic
    when the folder holds more than one mp3 (``os.listdir`` itself returns
    entries in arbitrary order).

    Raises:
        NotFound: if no entry name ends with ``.mp3``.
    """
    for filename in sorted(os.listdir(podcast_path)):
        if filename.endswith('.mp3'):
            return path.realpath(path.join(podcast_path, filename))
    raise NotFound('mp3 file not found in ' + podcast_path)
def pprint(mp3):
    """Print a fixed set of tags of ``mp3``, one ``name: value`` per line.

    ``mp3`` only needs a ``get(tag)`` method; whatever it returns is printed.
    """
    shown_tags = ('title', 'artist', 'date', 'description', 'releasedate')
    for name in shown_tags:
        value = mp3.get(name)
        print('{0}: {1}'.format(name, value))
def folder(podcast_path):
    """Tag the mp3 of one podcast folder and move it next to that folder.

    The folder must contain an mp3 file and a ``text.txt`` with the show
    notes. Title, artist, dates, description and track number are written
    to the file. It is then moved to the folder's parent directory under the
    name ``<YYYY-MM-DD> - <title>.mp3``.

    The ``description``, ``title3`` and ``releasedate`` keys are custom
    EasyID3 keys; ``main`` registers them before calling this function.

    Raises:
        NotFound: if the folder has no mp3 file (from ``get_file_name``).
    """
    print('process ' + podcast_path)
    mp3_path = get_file_name(podcast_path)
    notes = read(path.join(podcast_path, 'text.txt'), mp3_path)

    timestamp = notes.date.strftime('%Y-%m-%d %H:%M:%S')
    mp3 = Mp3(mp3_path)
    mp3.set('title', notes.title)
    mp3.set('artist', 'Янки после пьянки')
    mp3.set('releasedate', timestamp)
    mp3.set('date', timestamp)
    mp3.set('description', notes.text)
    mp3.set('title3', notes.text)
    mp3.set('tracknumber', str(notes.number))
    mp3.save()
    print(mp3_path)

    new_name = notes.date.strftime('%Y-%m-%d') + ' - ' + notes.title + '.mp3'
    new_path = os.path.join(os.path.dirname(podcast_path), new_name)
    # The guard used to compare the mp3 path with the folder path, which is
    # always different. Compare with the destination instead; mp3_path is
    # already a real path, so resolve new_path the same way.
    if path.realpath(new_path) != mp3_path:
        shutil.move(mp3_path, new_path)
        print('moved to ' + new_path)
def main(root_dir='.'):
    """Process every podcast folder directly under ``root_dir``.

    A folder whose name ends in ``_1`` is deleted as a duplicate when the
    folder without that suffix also exists. Every other folder is passed to
    ``folder``. ``NotFound`` errors are counted and reported; any other
    exception is re-raised after printing the folder it happened in.
    """
    print('processing ' + root_dir)
    # Custom EasyID3 keys that folder() writes.
    EasyID3.RegisterTextKey('description', 'TDES')
    EasyID3.RegisterTextKey('title3', 'TIT3')
    EasyID3.RegisterTextKey('releasedate', 'TDRL')

    errors = 0
    deleted = 0
    processed = 0
    for name in sorted(os.listdir(root_dir)):
        filename = os.path.join(root_dir, name)
        if not path.isdir(filename):
            continue
        if filename.endswith('_1') and path.isdir(filename[:-2]):
            shutil.rmtree(filename)
            print('deleted ' + filename)
            deleted += 1
            continue
        try:
            # filename already includes root_dir; joining it again produced
            # "<root>/<root>/<name>" for relative roots other than '.'.
            folder(filename)
            processed += 1
        except NotFound as e:
            errors += 1
            print('####### ' + filename + ': ' + str(type(e)) + " " + str(e))
        except Exception:
            print(filename)
            raise

    if errors > 0:
        print(f'got {errors} errors')
    if deleted > 0:
        # Previously reported the error count here by mistake.
        print(f'deleted {deleted} duplicates')
    if processed > 0:
        print(f'processed {processed} files')
    print('done')
def bitrate(root_dir='.'):
    """Print ``<file name> <bitrate> kbps`` for each mp3 ``process`` visits."""

    def action(filename):
        name = path.basename(filename)
        # mutagen reports bits per second; show whole kilobits.
        kbps = int(MP3(filename).info.bitrate / 1000)
        print(name + ' ' + str(kbps) + ' kbps')

    process(action, root_dir)
def length(root_dir='.'):
    """Print the total playing time of the mp3s ``process`` visits.

    The total is printed as ``H:MM:SS``; hours are not capped at 24 and
    fractions of a second are dropped.
    """

    def action(filename, context):
        # context is the running total in seconds.
        return context + MP3(filename).info.length

    total = process(action, root_dir, context=0)
    # Integer arithmetic throughout: the previous true division produced
    # float hours/minutes such as "1:62.5:30".
    minutes, seconds = divmod(int(total), 60)
    hours, minutes = divmod(minutes, 60)
    print('{}:{:02d}:{:02d}'.format(hours, minutes, seconds))
def clean_filename(root_dir='.'):
    """Report mp3 file names that contain suspicious characters.

    For each mp3 that ``process`` visits, every character from a fixed set
    found in the base name is reported, followed by the first run of
    characters that falls outside the allowed character class.
    Nothing is renamed; this only prints.
    """

    def action(filename):
        base = path.basename(filename)
        suspicious = r'\/:*?"<>|–'
        for ch in (c for c in suspicious if c in base):
            print(f'{filename} contains "{ch}"')
        match = re.search(r'[^,()№ё!A-Za-z\d#.А-Яа-я- ]+', base)
        if match is not None:
            print(f'{filename} contains something "{match.group()}"')

    process(action, root_dir)
def process(action, root_dir, context=None):
    """Call ``action`` for every ``.mp3`` file under ``root_dir``, recursively.

    Entries are visited in sorted order; the files of a directory are
    handled before its subdirectories.

    Args:
        action: called as ``action(path)`` when ``context`` is ``None``,
            otherwise as ``action(path, context)``; in the second form its
            return value becomes the new context.
        root_dir: directory to walk.
        context: optional accumulator threaded through the calls.

    Returns:
        The final context (``None`` when no context was given).
    """
    dirs = []
    for name in sorted(os.listdir(root_dir)):
        filename = os.path.join(root_dir, name)
        if path.isdir(filename):
            dirs.append(filename)
        elif path.isfile(filename) and filename.endswith('.mp3'):
            if context is not None:
                context = action(filename, context)
            else:
                action(filename)
    for d in dirs:
        # Descend into the subdirectory. The old loop re-ran action on the
        # stale last entry of the listing and never looked inside d.
        context = process(action, d, context)
    return context
if __name__ == '__main__':
    # Script entry point: only the read-only reports run here.
    # main() (tagging and moving files) is left disabled.
    for report in (bitrate, clean_filename):
        report()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment