Last active
August 5, 2024 02:28
-
-
Save pszemraj/086cefc03012ed0b886c2eea6bd11eb3 to your computer and use it in GitHub Desktop.
minimalist dataset viewer based on https://github.com/huggingface/datasets-viewer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import streamlit as st | |
import pandas as pd | |
from datasets import load_from_disk | |
import textwrap | |
import json | |
# Constants
# Number of rows shown per page; also the step size of the "Offset" input.
ROWS_PER_PAGE = 100
# Logo image displayed at the top of the sidebar.
LOGO_URL = (
    "https://huggingface.co/datasets/huggingface/documentation-images"
    "/resolve/main/datasets/datasets_logo.png"
)
# Target of the "Documentation" link in the sidebar.
DOCS_URL = "https://huggingface.co/docs/datasets/index"
def load_dataset(path):
    """Load a dataset previously saved to disk at *path*.

    Returns whatever ``load_from_disk(path)`` returns. If loading raises,
    the error is shown in the Streamlit page via ``st.error`` and ``None``
    is returned so the caller can stop rendering.
    """
    try:
        loaded = load_from_disk(path)
    except Exception as exc:
        # Top-level UI boundary: surface any failure to the user, don't crash.
        st.error(f"Error loading dataset: {exc}")
        loaded = None
    return loaded
def render_features(features):
    """Recursively convert a features mapping into plain strings.

    Dict values are rendered recursively (keys are kept as-is); any
    non-dict value is converted with ``str``.
    """
    if not isinstance(features, dict):
        return str(features)

    rendered = {}
    for name, spec in features.items():
        rendered[name] = render_features(spec)
    return rendered
def _format_value(value):
    """Return the display text for a single cell in Full View."""
    if isinstance(value, str):
        return textwrap.fill(value, width=100)
    if isinstance(value, (int, float, bool)):
        return str(value)
    # Nested values may hold objects json cannot encode (dates, bytes, ...);
    # this is a read-only viewer, so falling back to str() is deliberate.
    return json.dumps(value, indent=2, ensure_ascii=False, default=str)


def main():
    """Render the Streamlit page: sidebar controls plus one page of rows.

    Reads the dataset path from the sidebar, loads it with ``load_dataset``
    and shows ``ROWS_PER_PAGE`` rows of the chosen split starting at the
    chosen offset, either as a table or (with "Full View") item by item.
    Returns early, after showing a message, when no path is entered, the
    dataset fails to load, or it contains no splits.
    """
    st.set_page_config(page_title="Local Dataset Viewer", layout="wide")

    # Sidebar
    st.sidebar.image(LOGO_URL, width=100)
    st.sidebar.title("Datasets")
    st.sidebar.markdown("[GitHub](http://github.com/huggingface/datasets)")
    st.sidebar.markdown(
        f"[Documentation]({DOCS_URL}) | [Submit dataset](https://huggingface.co/datasets)"
    )

    dataset_path = st.sidebar.text_input("Dataset Path:")
    if not dataset_path:
        st.warning("Please enter a dataset path.")
        return

    dataset = load_dataset(dataset_path)
    if dataset is None:
        return

    # load_from_disk may return a single Dataset instead of a dict of splits;
    # wrap it so the split-based code below works for both.
    if not isinstance(dataset, dict):
        dataset = {str(getattr(dataset, "split", None) or "default"): dataset}

    subsets = list(dataset.keys())
    if not subsets:
        st.warning("The dataset contains no splits.")
        return

    # Dataset selection (strip trailing slashes so "path/to/ds/" still
    # yields "ds" rather than an empty name)
    dataset_name = dataset_path.rstrip("/").split("/")[-1] or dataset_path
    st.sidebar.selectbox("Dataset", [dataset_name], key="dataset_select")

    # Subset selection
    selected_subset = st.sidebar.selectbox("Subset", subsets)

    # Full View option
    full_view = st.sidebar.checkbox("Full View")

    # Main content
    st.title(f"Dataset: {dataset_name} /{selected_subset}")

    # Split selection
    split = st.selectbox("Split", subsets, index=subsets.index(selected_subset))
    split_data = dataset[split]

    # Keys selection: taken from the split actually displayed, so every
    # selected key is guaranteed to exist in the rows below.
    all_keys = list(split_data.features.keys())
    selected_keys = st.sidebar.multiselect("Keys", all_keys, default=all_keys)

    # Offset
    total_rows = len(split_data)
    max_offset = max(0, total_rows - ROWS_PER_PAGE)
    offset = st.number_input(
        "Offset", min_value=0, max_value=max_offset, value=0, step=ROWS_PER_PAGE
    )

    # Display data
    end_idx = min(offset + ROWS_PER_PAGE, total_rows)
    page = split_data.select(range(offset, end_idx))

    if full_view:
        for idx, item in enumerate(page):
            st.subheader(f"Item {offset + idx}")
            for key in selected_keys:
                st.text(f"{key}: {_format_value(item[key])}")
            st.write("---")
    else:
        # Passing columns explicitly keeps the column selection valid even
        # when the page has no rows.
        df = pd.DataFrame(list(page), columns=all_keys)[selected_keys]
        st.dataframe(df)

    # Overview: the dataset was loaded from disk, so show the matching call.
    st.sidebar.subheader("Overview")
    code_snippet = (
        "!pip install datasets\n"
        "from datasets import load_from_disk\n"
        f"dataset = load_from_disk({dataset_path!r})\n"
    )
    st.sidebar.code(code_snippet, language="python")

    # Dataset description (skipped when empty)
    description = split_data.info.description
    if description:
        st.sidebar.markdown(description)
# Build the page only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
datasets | |
pandas | |
streamlit |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
usage
Save it as local_dataset_viewer.py.
Make sure you have the required libraries installed: `pip install datasets pandas streamlit`
Run the script using Streamlit: `streamlit run local_dataset_viewer.py`
In the web interface, enter the path to your locally saved dataset.
example ui: