Last active
March 30, 2024 13:23
-
-
Save Menziess/bfcbea6a309e0990e8c296ce23125059 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def deep_ls(path: str, max_depth=1, reverse=False, key=None, keep_hidden=False):
    """List all files in base path recursively.

    List all files and folders in specified path and subfolders within
    maximum recursion depth. For every folder visited, its files are
    yielded first; its subfolders are then either descended into (while
    the depth limit has not been reached) or yielded themselves (at the
    depth limit).

    Parameters
    ----------
    path : str
        The path of the folder from which files are listed
    max_depth : int
        The maximum recursion depth
    reverse : bool
        As used in `sorted([1, 2], reverse=True)`
    key : Callable
        As used in `sorted(['aa', 'aaa'], key=len)`
    keep_hidden : bool
        Keep files and folders starting with '_' or '.'

    Yields
    ------
    The entries returned by `dbutils.fs.ls`, sorted per folder according
    to `reverse` and `key`.

    Examples
    --------
    >>> from pprint import pprint
    >>> files = list(deep_ls('/databricks-datasets/asa/airlines'))
    >>> pprint(files) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
    [FileInfo(path='dbfs:/databricks-datasets/asa/airlines/1987.csv', name='1987.csv', size=127162942),
    ...
    FileInfo(path='dbfs:/databricks-datasets/asa/airlines/2008.csv', name='2008.csv', size=689413344)]
    >>> first, *_, last = files
    >>> first
    FileInfo(path='dbfs:/databricks-datasets/asa/airlines/1987.csv', name='1987.csv', size=127162942)
    >>> last
    FileInfo(path='dbfs:/databricks-datasets/asa/airlines/2008.csv', name='2008.csv', size=689413344)
    """
    def is_folder(entry):
        # Folders are recognised by a trailing '/' in their path. Compare by
        # value (`endswith`), never by identity (`is '/'`), which only works
        # by accident of string caching and warns on Python 3.8+.
        return entry.path.endswith('/')

    entries = dbutils.fs.ls(path)

    # Hidden files may be filtered out
    if not keep_hidden:
        entries = [e for e in entries if not e.name.startswith(('_', '.'))]

    # Apply sorting rules to the entries of this folder
    entries = sorted(entries, reverse=reverse, key=key)

    # Return all files (not ending with '/')
    yield from (e for e in entries if not is_folder(e))

    folders = [e for e in entries if is_folder(e)]
    if max_depth > 1:
        # If the max_depth has not been reached, start
        # listing files and folders in subdirectories
        for folder in folders:
            yield from deep_ls(folder.path, max_depth - 1, reverse, key, keep_hidden)
    else:
        # If max_depth has been reached, return the folders
        yield from folders
def key(val):
    """Sort function.

    Takes an object whose `path` attribute is a filepath such as:
    '/mnt/raw/store/item/year=2019/month=6/day=4/'
    Extracts the integer 4 (the text after the last '=' in the last
    non-empty path segment) or returns -1 when that text is not an
    integer or the path has no non-empty segment.
    """
    # Drop empty pieces caused by leading, trailing or doubled slashes
    segments = [part for part in val.path.split('/') if part]
    if not segments:
        # Nothing to parse (e.g. '' or '/'): honour the documented fallback
        return -1
    try:
        # rpartition leaves the whole segment in [2] when there is no '='
        return int(segments[-1].rpartition('=')[2])
    except ValueError:
        return -1
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment