Created
August 24, 2024 22:55
-
-
Save pszemraj/16d60ed1e9dd95632938457eec3069fe to your computer and use it in GitHub Desktop.
Load the Python subset of smollm-corpus without AWS credentials
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import gzip
from contextlib import closing

import boto3
from botocore import UNSIGNED
from botocore.config import Config
from datasets import load_dataset
# Worker-process count handed to both load_dataset() and Dataset.map() below.
num_proc = 32

# Bucket that download_contents() reads blobs from.
bucket_name = "softwareheritage"

# UNSIGNED signature version: requests go out unsigned, so the script runs
# without any AWS credentials configured.
s3 = boto3.client(
    "s3",
    config=Config(signature_version=UNSIGNED),
)
def download_contents(blob_id):
    """Fetch one blob from the S3 bucket and return its decoded text.

    Args:
        blob_id: Identifier of the blob; the object is read from the key
            ``content/<blob_id>`` in ``bucket_name``.

    Returns:
        A dict ``{"text": content}`` where ``content`` is the gunzipped
        object decoded as UTF-8.
    """
    key = f"content/{blob_id}"
    obj = s3.get_object(Bucket=bucket_name, Key=key)
    # GzipFile.close() does not close a fileobj it was handed, so the response
    # body is closed explicitly to release it even if decompression fails.
    with closing(obj["Body"]) as body, gzip.GzipFile(fileobj=body) as fin:
        # errors="ignore" silently drops bytes that are not valid UTF-8.
        content = fin.read().decode("utf-8", errors="ignore")
    return {"text": content}
if __name__ == "__main__":
    # Guarded entry point: both calls below are given num_proc worker
    # processes. Under the "spawn" start method workers re-import the main
    # module, and unguarded top-level code would be executed again in each
    # of them.
    ds = load_dataset(
        "HuggingFaceTB/smollm-corpus",
        "python-edu",
        split="train",
        num_proc=num_proc,
    )
    # Call download_contents on each row's "blob_id"; the returned dict adds a
    # "text" column to the dataset.
    ds = ds.map(download_contents, input_columns="blob_id", num_proc=num_proc)
    # Print the first row as a quick sanity check.
    print(ds[0])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment