Skip to content

Instantly share code, notes, and snippets.

@pszemraj
Created August 24, 2024 22:55
Show Gist options
  • Save pszemraj/16d60ed1e9dd95632938457eec3069fe to your computer and use it in GitHub Desktop.
Save pszemraj/16d60ed1e9dd95632938457eec3069fe to your computer and use it in GitHub Desktop.
Load the Python subset of smollm-corpus without AWS credentials
import boto3
import gzip
from datasets import load_dataset
from botocore import UNSIGNED
from botocore.config import Config
# Passed as `num_proc` to both `load_dataset` and `Dataset.map` below.
num_proc = 32

# S3 bucket that `download_contents` reads objects from.
bucket_name = "softwareheritage"

# UNSIGNED disables request signing, so the client works without AWS
# credentials.
s3 = boto3.client(
    service_name="s3",
    config=Config(signature_version=UNSIGNED),
)
def download_contents(blob_id):
    """Fetch one gzip-compressed object from S3 and return its decoded text.

    Args:
        blob_id: Identifier appended to ``content/`` to form the object key
            inside the ``bucket_name`` bucket.

    Returns:
        A dict ``{"text": ...}`` holding the decompressed payload decoded as
        UTF-8; bytes that cannot be decoded are dropped instead of raising.
    """
    key = f"content/{blob_id}"
    body = s3.get_object(Bucket=bucket_name, Key=key)["Body"]
    try:
        with gzip.GzipFile(fileobj=body) as fin:
            content = fin.read().decode("utf-8", errors="ignore")
    finally:
        # GzipFile.close() does not close a caller-supplied fileobj, so the
        # response stream has to be released explicitly.
        body.close()
    return {"text": content}
# Load the train split of the "python-edu" configuration.
ds = load_dataset(
    path="HuggingFaceTB/smollm-corpus",
    name="python-edu",
    split="train",
    num_proc=num_proc,
)

# Call `download_contents` with each row's `blob_id` value; the dict it
# returns supplies the "text" field for that row.
ds = ds.map(
    function=download_contents,
    input_columns="blob_id",
    num_proc=num_proc,
)

# Print the first record as a quick check of the result.
print(ds[0])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment