Last active
May 3, 2024 11:10
-
-
Save AntreasAntoniou/dc62fa78bf1d35f67ae92fb0c18aaf5a to your computer and use it in GitHub Desktop.
A script that automates upload and download using parallel workers with the huggingface api
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Python Package Imports
import os
import multiprocessing as mp
import subprocess
import sys

try:
    import fire
    from tqdm.auto import tqdm
    from huggingface_hub import HfApi
    # `from huggingface_hub import hf_hub` is not a valid import (the package
    # exposes no `hf_hub` member); alias the package itself so
    # `hf_hub.snapshot_download(...)` resolves correctly.
    import huggingface_hub as hf_hub
except ImportError as e:
    # Handle missing library. ImportError.name holds the bare module name;
    # parsing str(e) keeps the surrounding quotes (e.g. "'fire'"),
    # which would break the pip install command.
    missing_lib = e.name or str(e).split(' ')[-1].strip("'\"")
    response = input(f"Missing library: {missing_lib}. Would you like to auto-install it? (yes/no): ")
    if response.lower() == 'yes':
        # pip.main() was removed from pip's public API; the supported way to
        # install programmatically is to invoke pip via the current interpreter.
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', missing_lib])
        # Retry the imports so the script can proceed in the same run.
        # (Only the first missing library is installed per run; a second
        # missing dependency will surface as a fresh ImportError here.)
        import fire
        from tqdm.auto import tqdm
        from huggingface_hub import HfApi
        import huggingface_hub as hf_hub
    else:
        # Raise error as library is necessary for script
        raise ImportError(f"{missing_lib} is required for this script to run.") from e
def upload_directory_to_hf(target_dir, repo_name, dir_filter="subdir"):
    """
    Upload matching subdirectories of a local directory to a Huggingface
    dataset repository.

    Each subdirectory whose path contains ``dir_filter`` is uploaded under
    ``data/<basename>`` in the repository.

    Args:
        target_dir (str): The directory where the files reside.
        repo_name (str): The name of the Huggingface dataset repository.
        dir_filter (str): Only paths containing this substring are uploaded.
            Defaults to "subdir", preserving the original hard-coded filter.

    Returns:
        None
    """
    # Disable the hf_transfer backend so plain (pure-Python) uploads are used.
    # NOTE(review): the original comment claimed this was for authentication,
    # but this variable only toggles the transfer backend — confirm intent.
    os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0"
    # Instantiate HfApi for uploads (uses cached HF credentials).
    api = HfApi()
    # Walk through directory, upload matching subdirectories to HF.
    for subdir, dirs, files in tqdm(os.walk(target_dir)):
        if dir_filter not in subdir:
            continue
        api.upload_folder(
            folder_path=subdir,
            # os.path.basename is portable; split('/')[-1] breaks on
            # Windows-style path separators.
            path_in_repo=f"data/{os.path.basename(subdir)}",
            repo_id=repo_name,
            repo_type="dataset",
        )
    print("Upload to Huggingface dataset repository was successful.")
def download_dataset_from_hf(repo_name, cache_dir):
    """
    Download a dataset snapshot from Huggingface.

    Args:
        repo_name (str): The name of the Huggingface dataset repository from which to download.
        cache_dir (str): The directory in which to cache the downloaded files.

    Returns:
        None
    """
    # `hf_hub` is not an importable member of huggingface_hub; the
    # snapshot_download function is exposed at the package top level.
    from huggingface_hub import snapshot_download

    # Download dataset snapshot from Huggingface, using one worker per CPU
    # to speed up the process. resume_download lets an interrupted run
    # pick up where it left off.
    # NOTE(review): resume_download is deprecated (and a no-op) in recent
    # huggingface_hub releases — confirm the pinned library version.
    snapshot_download(
        repo_id=repo_name,
        repo_type="dataset",
        cache_dir=cache_dir,
        resume_download=True,
        max_workers=mp.cpu_count(),
        ignore_patterns=[],
    )
    print(f"Download of {repo_name} to {cache_dir} was successful.")
if __name__ == '__main__':
    # Expose the upload/download helpers as CLI subcommands via Google Fire.
    commands = {
        'upload': upload_directory_to_hf,
        'download': download_dataset_from_hf,
    }
    fire.Fire(commands)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
HuggingFace Dataset Uploader and Downloader
This is a Python script for easily uploading and downloading datasets to and from HuggingFace's datasets repository. It uses the `HfApi` class from the `huggingface_hub` Python package to make API requests to HuggingFace, and `tqdm` for display of progress bars. Argument parsing is done via `fire`, which creates a command-line interface from the Python code.

Installation
To run the script, you will first need to install necessary Python packages. Use pip to install the following:
Usage
There are two main functions in `script.py`: `upload_directory_to_hf` and `download_dataset_from_hf`.

Upload

`upload_directory_to_hf` uploads a directory and its contents to a specified HuggingFace dataset repository. To use it, run the following command in your terminal, making sure to replace `/path/to/directory` with the path to the directory you want to upload, and `YourRepoName` with the name of your HuggingFace data repository:

Download

`download_dataset_from_hf` downloads a dataset from a specified HuggingFace dataset repository to a local cache directory. To use it, run the following command in your terminal, making sure to replace `/path/to/cache` with the path to your local cache directory, and `YourRepoName` with the name of the HuggingFace data repository that contains the dataset you want to download:

Please note that you need to be authenticated with your HuggingFace account on your machine. The
HfApi
uses your cached HuggingFace credentials for authentication. To login to your HuggingFace account, use the following command: