Skip to content

Instantly share code, notes, and snippets.

@pszemraj
Last active August 5, 2024 02:28
Show Gist options
  • Save pszemraj/086cefc03012ed0b886c2eea6bd11eb3 to your computer and use it in GitHub Desktop.
Save pszemraj/086cefc03012ed0b886c2eea6bd11eb3 to your computer and use it in GitHub Desktop.
Minimalist Streamlit viewer for locally saved Hugging Face datasets, based on https://github.com/huggingface/datasets-viewer
import streamlit as st
import pandas as pd
from datasets import load_from_disk
import textwrap
import json
# Constants
# Number of rows shown per page; also the step of the "Offset" input.
ROWS_PER_PAGE = 100
# Logo displayed at the top of the sidebar.
LOGO_URL = (
    "https://huggingface.co/datasets/huggingface/documentation-images"
    "/resolve/main/datasets/datasets_logo.png"
)
# Target of the "Documentation" link in the sidebar.
DOCS_URL = "https://huggingface.co/docs/datasets/index"
def load_dataset(path):
    """Load a dataset saved on disk, reporting failures in the Streamlit UI.

    Args:
        path: Location passed straight to ``load_from_disk``.

    Returns:
        Whatever ``load_from_disk`` returns, or ``None`` if loading raised.
        In the failure case the error message is shown with ``st.error``.
    """
    try:
        loaded = load_from_disk(path)
    except Exception as exc:
        # Any failure is surfaced to the user instead of crashing the app.
        st.error(f"Error loading dataset: {str(exc)}")
        loaded = None
    return loaded
def render_features(features):
    """Turn a (possibly nested) feature mapping into plain strings.

    Args:
        features: A dict whose values are converted recursively, or any
            other object, which is converted with ``str``.

    Returns:
        A dict with the same keys and recursively rendered values when
        ``features`` is a dict; otherwise ``str(features)``.
    """
    if not isinstance(features, dict):
        return str(features)

    rendered = {}
    for name, spec in features.items():
        rendered[name] = render_features(spec)
    return rendered
def _format_value(value):
    """Return a display-ready form of one cell value for the full view.

    Strings are wrapped at 100 columns, plain scalars (int, float, bool) are
    returned unchanged, and everything else is pretty-printed as JSON.
    """
    if isinstance(value, str):
        return textwrap.fill(value, width=100)
    if isinstance(value, (int, float, bool)):
        return value
    # default=str keeps the viewer alive on values JSON cannot encode natively
    # (bytes, datetimes, ...); this output is for display only.
    # ensure_ascii=False keeps non-ASCII text readable instead of \u-escaped.
    return json.dumps(value, indent=2, ensure_ascii=False, default=str)


def main():
    """Render the Streamlit page of the local dataset viewer.

    The sidebar collects the dataset path, the subset, the "Full View" toggle
    and the columns to show. The main area shows one page of
    ``ROWS_PER_PAGE`` rows of the chosen split, either as a dataframe or as
    one text section per row. The function returns early (after showing a
    message) when no path is entered, loading fails, or no split exists.
    """
    st.set_page_config(page_title="Local Dataset Viewer", layout="wide")

    # Sidebar
    st.sidebar.image(LOGO_URL, width=100)
    st.sidebar.title("Datasets")
    st.sidebar.markdown("[GitHub](http://github.com/huggingface/datasets)")
    st.sidebar.markdown(
        f"[Documentation]({DOCS_URL}) | [Submit dataset](https://huggingface.co/datasets)"
    )
    dataset_path = st.sidebar.text_input("Dataset Path:")
    if not dataset_path:
        st.warning("Please enter a dataset path.")
        return

    dataset = load_dataset(dataset_path)
    if dataset is None:
        return

    # A directory saved from a single Dataset loads as a bare Dataset rather
    # than a dict-like DatasetDict; wrap it so the split logic below applies.
    is_single_dataset = not isinstance(dataset, dict)
    if is_single_dataset:
        split_name = str(getattr(dataset, "split", None) or "train")
        dataset = {split_name: dataset}

    # Dataset selection
    # Strip trailing slashes so "path/to/ds/" is still named "ds".
    dataset_name = dataset_path.rstrip("/").rsplit("/", 1)[-1] or dataset_path
    st.sidebar.selectbox("Dataset", [dataset_name], key="dataset_select")

    # Subset selection
    subsets = list(dataset.keys())
    if not subsets:
        st.warning("The dataset contains no splits.")
        return
    selected_subset = st.sidebar.selectbox("Subset", subsets)

    # Full View option
    full_view = st.sidebar.checkbox("Full View")

    # Main content
    st.title(f"Dataset: {dataset_name} /{selected_subset}")

    # Split selection
    split = st.selectbox("Split", subsets, index=subsets.index(selected_subset))

    # Keys selection: taken from the split that is actually displayed, so the
    # chosen columns always exist in the rows below. The widget still renders
    # in the sidebar after the "Full View" checkbox.
    all_keys = list(dataset[split].features.keys())
    selected_keys = st.sidebar.multiselect("Keys", all_keys, default=all_keys)

    # Offset
    total_rows = len(dataset[split])
    max_offset = max(0, total_rows - ROWS_PER_PAGE)
    offset = st.number_input(
        "Offset", min_value=0, max_value=max_offset, value=0, step=ROWS_PER_PAGE
    )

    # Display data
    end_idx = min(offset + ROWS_PER_PAGE, total_rows)
    data = dataset[split].select(range(offset, end_idx))
    if full_view:
        for idx, item in enumerate(data):
            st.subheader(f"Item {offset + idx}")
            for key in selected_keys:
                st.text(f"{key}: {_format_value(item[key])}")
            st.write("---")
    else:
        # Passing columns= (instead of indexing afterwards) keeps an empty
        # page from raising KeyError on a column-less frame.
        df = pd.DataFrame(list(data), columns=selected_keys)
        st.dataframe(df)

    # Overview
    st.sidebar.subheader("Overview")
    # The data was loaded from disk, so the snippet must use load_from_disk.
    load_expr = f"load_from_disk({dataset_path!r})"
    if not is_single_dataset:
        load_expr += f"[{split!r}]"
    code_snippet = "\n".join(
        [
            "!pip install datasets",
            "from datasets import load_from_disk",
            f"dataset = {load_expr}",
        ]
    )
    st.sidebar.code(code_snippet, language="python")

    # Dataset description
    description = dataset[split].info.description
    if description:
        st.sidebar.markdown(description)


# Streamlit executes the script as "__main__", so this starts the app under
# `streamlit run` while leaving plain imports side-effect free.
if __name__ == "__main__":
    main()
datasets
pandas
streamlit
@pszemraj
Copy link
Author

pszemraj commented Aug 5, 2024

usage

  1. Save it as local_dataset_viewer.py.

  2. Make sure you have the required libraries installed:

    
    pip install streamlit pandas datasets
    
    
  3. Run the script using Streamlit:

    
    streamlit run local_dataset_viewer.py
    
    
  4. In the web interface, enter the path to your locally saved dataset (a directory created with `save_to_disk`) in the "Dataset Path:" field of the sidebar.

example ui:

viewer

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment