Last active
February 29, 2020 23:37
-
-
Save detain/df1cb0cccc55a01e77bf928db7394b0b to your computer and use it in GitHub Desktop.
Recursively iterates a directory finding MP3s and then loads the ID3 info via 'mediainfo' for each. It then builds a list of genre popularity by artist (how many songs for each artist use which genre), handling genre tags that are set to multiple genres separated by either a semicolon (;) or a forward slash (/).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/** | |
* Recursively iterates a directory finding MP3s and then loads the ID3 info via 'mediainfo' for each.
* It then builds a list of genre popularity by artist (how many songs for each artist use which genre),
* handling genre tags that are set to multiple genres separated by either a semicolon (;) or a forward slash (/).
* | |
* - Automatic Backups every 120 seconds (configurable) | |
* @see $backupSeconds | |
* - Intelligent reprocessing of MP3s, only updating when the "Last Modified Time" is newer than what we stored in the db
* - Command optimization drastically cutting back on the number of times the update command ("mp3info2") is run:
* say for 2500 files to update (originally 2500 calls to update them) it groups
* them into a single call per genre, so for me it went down from around 2500 to 80.
* - Automatic handling of removed files without having to specifically check for them.
* | |
* @author Joe Huss <detain@interserver.net> | |
* @copyright 2020 | |
* * | |
*/ | |
// Root directory that is scanned recursively for MP3 files.
$mp3Base = __DIR__;
// Number of files handled so far during the scan (used for progress output).
$found = 0;
// Genre values that are treated as placeholders rather than real genres.
$badGenres = ['Other', '!', 'Unknown', 'Miscellaneous'];
// Paths of the MP3s whose genre tag should be rewritten.
$toFix = [];
// artist => [genre => number of songs using it].
$artistCounts = [];
// Minimum number of seconds between intermediate saves of the collected data.
$backupSeconds = 120;

// Both external tools are mandatory. "which" exits with a non-zero status when
// the command cannot be found, so any non-zero status counts as "missing"
// (not only the value 1).
exec('which mediainfo', $out, $missingMediaInfo);
unset($out);
exec('which mp3info2', $out, $missingMp3Info2);
unset($out);
if ($missingMediaInfo !== 0 || $missingMp3Info2 !== 0) {
    if ($missingMediaInfo !== 0) {
        echo '[ERROR] Missing "mediainfo" ! Get it from https://mediaarea.net/en/MediaInfo or "apt install mediainfo"'.PHP_EOL;
    }
    if ($missingMp3Info2 !== 0) {
        echo '[ERROR] Missing "mp3info2" ! Get it from perl MP3::Tag module at https://metacpan.org/release/MP3-Tag or "apt install libmp3-tag-perl"'.PHP_EOL;
    }
    // Signal the failure to the calling shell instead of exiting with status 0.
    exit(1);
}
// Reload the results of a previous run so unchanged files can be skipped.
// A missing, unreadable or corrupt cache simply means starting from scratch.
$mp3s = [];
if (file_exists('mp3s.json')) {
    $stored = json_decode((string) file_get_contents('mp3s.json'), true);
    if (is_array($stored)) {
        $mp3s = $stored;
        echo "Loaded Stored Data for ".count($mp3s)." MP3s\n";
    } else {
        echo '[WARNING] Could not parse "mp3s.json", starting with an empty data set'.PHP_EOL;
    }
    unset($stored);
}

// Collect every *.mp3 file below the base directory using find(1).
echo "Scanning Path '{$mp3Base}'...";
$cmd = 'find '.escapeshellarg($mp3Base).' -type f -name "*.mp3"';
// Timestamp of this scan; also the starting point for the backup interval.
$scanTime = time();
$lastBackup = $scanTime;
// The backtick operator yields null when the command fails or prints nothing,
// and exploding an empty string would produce one bogus empty path.
$listing = trim((string) `$cmd`);
$files = $listing === '' ? [] : explode("\n", $listing);
$total = count($files);
echo $total.' files'.PHP_EOL;
// Writes the collected data to "mp3s.json". When the data cannot be encoded
// the existing file is left untouched instead of being truncated.
$saveMp3Data = function (array $store) {
    $json = json_encode($store, JSON_PRETTY_PRINT);
    if ($json === false) {
        echo '[ERROR] Unable to encode the MP3 data as JSON ('.json_last_error_msg().'), "mp3s.json" left untouched'.PHP_EOL;
        return;
    }
    $bytes = file_put_contents('mp3s.json', $json);
    if ($bytes === false) {
        echo '[ERROR] Unable to write "mp3s.json"'.PHP_EOL;
        return;
    }
    echo $bytes.' bytes written to "mp3s.json"'.PHP_EOL;
};

// Refresh the stored media info of every file found by the scan.
foreach ($files as $file) {
    $found++;
    // Skip empty entries and files that vanished since the directory scan.
    if ($file === '' || !is_file($file)) {
        continue;
    }
    $modified = filemtime($file);
    if ($modified === false) {
        continue;
    }
    // mediainfo is only consulted when the file is new or was modified after
    // the modification time recorded for it.
    $needsUpdate = true;
    if (array_key_exists($file, $mp3s) && isset($mp3s[$file]['modified'])) {
        if ($modified <= $mp3s[$file]['modified']) {
            $needsUpdate = false;
        }
    }
    if ($needsUpdate === true) {
        $cmd = 'mediainfo --Output=JSON '.escapeshellarg($file);
        $data = json_decode(trim((string) `$cmd`), true);
        if (!is_array($data)) {
            // Nothing usable came back; do not record the file so it is
            // retried on the next run.
            echo '[WARNING] mediainfo returned no usable data for '.$file.PHP_EOL;
            continue;
        }
    } else {
        // Unchanged file: continue with its own stored entry rather than
        // whatever the previous iteration left behind in $data.
        $data = $mp3s[$file];
    }
    $data['modified'] = $modified;
    $data['seen'] = $scanTime;
    $mp3s[$file] = $data;
    if (time() - $lastBackup > $backupSeconds) {
        echo '['.$found.'/'.$total.'] Backup Interval triggered saving the current data...';
        $saveMp3Data($mp3s);
        $lastBackup = time();
    }
}

// Entries that were not stamped during this scan belong to files that no
// longer exist (or could not be read); drop them so they are not processed.
$removed = 0;
foreach ($mp3s as $storedPath => $storedInfo) {
    if (!isset($storedInfo['seen']) || $storedInfo['seen'] !== $scanTime) {
        unset($mp3s[$storedPath]);
        $removed++;
    }
}
if ($removed > 0) {
    echo 'Dropped '.$removed.' stored entries for files that were not found in this scan'.PHP_EOL;
}

echo "File Info Loaded from ".count($mp3s)." MP3s\n";
$saveMp3Data($mp3s);
// Count, per artist, how many songs use each genre and collect the files
// whose genre tag is missing, a placeholder, or a list of several genres.
echo "Examining the MP3s for Genres and generating some mapping info".PHP_EOL;
foreach ($mp3s as $mp3 => $data) {
    // Entries without track information cannot be analysed.
    if (!isset($data['media']['track']) || !is_array($data['media']['track'])) {
        continue;
    }
    // The tag fields are read from the track whose "@type" is "General"
    // (the last one wins should there be several).
    $general = null;
    foreach ($data['media']['track'] as $track) {
        if (isset($track['@type']) && $track['@type'] === 'General') {
            $general = $track;
        }
    }
    // Without an artist there is nothing to derive a genre from.
    if ($general === null || !isset($general['Performer'])) {
        continue;
    }
    if (!isset($general['Genre'])) {
        $toFix[] = $mp3;
        continue;
    }
    //echo "MP3: $mp3\n";
    //print_r($general);
    $artist = $general['Performer'];
    if (!array_key_exists($artist, $artistCounts)) {
        $artistCounts[$artist] = [];
    }
    // Split on both supported separators so that mixed tags such as
    // "Rock; Pop/Dance" are fully separated.
    // NOTE(review): assumes the Genre value is a string - confirm against mediainfo output.
    $pieces = preg_split('/[;\/]/', $general['Genre']);
    // Keep only real genres: empty pieces and placeholder genres are ignored
    // so they can never become an artist's dominant genre.
    $genres = [];
    foreach ($pieces as $piece) {
        $piece = trim($piece);
        if ($piece !== '' && !in_array($piece, $badGenres, true)) {
            $genres[] = $piece;
        }
    }
    // A file needs fixing when its tag lists several genres or contains no
    // usable genre at all.
    if (count($pieces) > 1 || count($genres) === 0) {
        $toFix[] = $mp3;
    }
    foreach ($genres as $genre) {
        if (!isset($artistCounts[$artist][$genre])) {
            $artistCounts[$artist][$genre] = 0;
        }
        $artistCounts[$artist][$genre]++;
    }
}
echo count($toFix)." mp3 files found with multiple genres to fix\n";
// Collapse each artist's genre counts into the single most frequent genre.
// On a tie the genre that was counted first wins.
foreach ($artistCounts as $artist => $genres) {
    if (!is_array($genres) || count($genres) === 0) {
        // No usable genre was counted for this artist. Remove the artist so
        // its files are skipped instead of being mapped to a bogus genre.
        unset($artistCounts[$artist]);
        continue;
    }
    $highestCount = 0;
    $highestGenre = false;
    foreach ($genres as $genre => $count) {
        if ($count > $highestCount) {
            $highestGenre = $genre;
            $highestCount = $count;
        }
    }
    if ($highestGenre === false) {
        unset($artistCounts[$artist]);
        continue;
    }
    // Array keys made of digits come back as integers; keep the genre a string.
    $highestGenre = (string) $highestGenre;
    echo 'Found artist '.$artist.' to most likely be "'.$highestGenre.'" with a frequency of '.$highestCount.' among '.count($genres).' detected genres'.PHP_EOL;
    $artistCounts[$artist] = $highestGenre;
}
// Map every file that needs fixing to the dominant genre of its artist.
$updates = [];
foreach ($toFix as $mp3) {
    if (!isset($mp3s[$mp3]['media']['track']) || !is_array($mp3s[$mp3]['media']['track'])) {
        continue;
    }
    // The artist is read from the track whose "@type" is "General"
    // (the last one wins should there be several).
    $general = null;
    foreach ($mp3s[$mp3]['media']['track'] as $track) {
        if (isset($track['@type']) && $track['@type'] === 'General') {
            $general = $track;
        }
    }
    if ($general === null || !isset($general['Performer'])) {
        continue;
    }
    $artist = $general['Performer'];
    if (!array_key_exists($artist, $artistCounts)) {
        //echo "Artist $artist had no genres for MP3 $mp3\n";
        continue;
    }
    $genre = $artistCounts[$artist];
    // An artist without a usable dominant genre must not produce an update,
    // otherwise the file would be tagged with a meaningless value.
    if ($genre === false || $genre === '' || is_array($genre)) {
        continue;
    }
    $updates[$mp3] = $genre;
}
//file_put_contents('updates.json', json_encode($updates, JSON_PRETTY_PRINT));
//echo "wrote updates.json with ".count($updates)." fixs\n";
// From here on only the files to update are of interest.
$mp3s = $updates;
// Invert the "file => genre" map into "genre => list of files" so that each
// genre can later be applied to all of its files at once.
$pendingCount = count($mp3s);
echo "Grouping ".$pendingCount." MP3s to update by Genres\n";
$genres = [];
foreach ($mp3s as $mp3 => $genre) {
    // Appending to a key that does not exist yet creates its list on the fly.
    $genres[$genre][] = $mp3;
}
$genreCount = count($genres);
echo "Sorted them down to ".$genreCount." genres to update\n";
// Build one mp3info2 call per genre that covers all files of that genre.
// NOTE(review): escapeshellarg() may drop non-ASCII bytes depending on the
// active LC_CTYPE locale - confirm the locale when paths contain such characters.
$cmds = [];
foreach ($genres as $genre => $files) {
    // Genre keys made of digits come back as integers, hence the cast.
    $cmd = 'mp3info2 -g '.escapeshellarg((string) $genre);
    $cmd .= ' '.implode(' ', array_map('escapeshellarg', $files));
    $cmds[] = $cmd;
}
// The commands are only written to a shell script; nothing is executed here.
$script = implode("\n", $cmds);
if ($script !== '') {
    $script .= "\n";
}
if (file_put_contents('update_mp3s.sh', $script) === false) {
    echo '[ERROR] Unable to write "update_mp3s.sh"'.PHP_EOL;
    exit(1);
}
echo 'Wrote "update_mp3s.sh" with '.count($cmds).' commands covering '.count($updates).' MP3s, run it (e.g. "sh update_mp3s.sh") to apply the genre changes'.PHP_EOL;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment