@tsibley
Created July 12, 2024 06:28
diff --git a/augur/io/json.py b/augur/io/json.py
index 2a4678ea..5eae5d1b 100644
--- a/augur/io/json.py
+++ b/augur/io/json.py
@@ -32,7 +32,7 @@ The LICENSE file included in ID3C's repo is copied below verbatim::
SOFTWARE.
"""
import json
-from datetime import datetime
+from datetime import date, datetime
from typing import Iterable
from uuid import UUID
@@ -96,10 +96,11 @@ class JsonEncoder(json.JSONEncoder):
"""
Returns *value* as JSON or raises a TypeError.
Serializes:
+ * :class:`~datetime.date` using :meth:`~datetime.date.isoformat()`
* :class:`~datetime.datetime` using :meth:`~datetime.datetime.isoformat()`
* :class:`~uuid.UUID` using ``str()``
"""
- if isinstance(value, datetime):
+ if isinstance(value, (date, datetime)):
return value.isoformat()
elif isinstance(value, UUID):
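For context, a minimal standalone sketch of the encoder behavior this hunk adds. It is illustrative only, not the full augur class; the class name here is made up:

    import json
    from datetime import date, datetime
    from uuid import UUID, uuid4

    class SketchEncoder(json.JSONEncoder):
        def default(self, value):
            # datetime is a subclass of date, so `date` alone would already
            # match both; the tuple mirrors the patch for readability.
            if isinstance(value, (date, datetime)):
                return value.isoformat()
            elif isinstance(value, UUID):
                return str(value)
            return super().default(value)

    print(json.dumps({"day": date(2024, 7, 12), "id": uuid4()}, cls=SketchEncoder))
    # {"day": "2024-07-12", "id": "…some random UUID…"}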
diff --git a/augur/io/metadata.py b/augur/io/metadata.py
index bcaef5b7..b318abfc 100644
--- a/augur/io/metadata.py
+++ b/augur/io/metadata.py
@@ -3,9 +3,10 @@ import os
from typing import Iterable, Sequence
import pandas as pd
import pyfastx
+import python_calamine as calamine
import sys
-from io import StringIO
-from itertools import chain
+from io import StringIO, TextIOWrapper
+from itertools import chain, zip_longest
from augur.errors import AugurError
from augur.io.print import print_err
@@ -169,11 +170,12 @@ def read_table_to_dict(table, delimiters, duplicate_reporting=DataErrorMethod.ER
Parameters
----------
table: str
- Path to a CSV or TSV file or IO buffer
+ Path to a CSV, TSV, or Excel file or IO buffer
delimiters : list of str
List of possible delimiters to check for between columns in the metadata.
Only one delimiter will be inferred.
+ Ignored if *table* is an Excel file.
duplicate_reporting: DataErrorMethod, optional
How should duplicate records be reported
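A hedged usage sketch of the updated reader; the file name and column name below are assumed examples, not from the patch, and for CSV/TSV input the generator behaves as before:

    from augur.io.metadata import read_table_to_dict

    # The delimiters argument is ignored for Excel input, per the docstring above.
    for record in read_table_to_dict("metadata.xlsx", delimiters=[",", "\t"]):
        print(record["strain"])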
@@ -197,34 +199,59 @@ def read_table_to_dict(table, delimiters, duplicate_reporting=DataErrorMethod.ER
"""
seen_ids = set()
duplicate_ids = set()
- with open_file(table) as handle:
- # Get sample to determine delimiter
- table_sample = handle.readline()
+ with open_file(table, "rb") as handle:
+ columns = None
+ records = None
+ # Try binary handle as Excel, as long as it's seekable so we can reset
+ # to the start on failure.
if handle.seekable():
- handle.seek(0)
- else:
- table_sample_file = StringIO(table_sample)
- handle = chain(table_sample_file, handle)
-
- try:
- # Note: this sort of duplicates _get_delimiter(), but it's easier if
- # this is separate since it handles non-seekable buffers.
- dialect = csv.Sniffer().sniff(table_sample, delimiters)
- except csv.Error as error:
- # This assumes all csv.Errors imply a delimiter issue. That might
- # change in a future Python version.
- raise InvalidDelimiter from error
-
- metadata_reader = csv.DictReader(handle, dialect=dialect)
+ try:
+ workbook = calamine.load_workbook(handle)
+ except calamine.CalamineError:
+ handle.seek(0)
+ else:
+ rows = workbook.get_sheet_by_index(0).to_python()
+ columns = rows[0]
+ records = (
+ dict(zip_longest(columns, row[:len(columns)]))
+ for row
+ in rows[1:])
+
+ # Not Excel, so convert handle to text and sniff the delimiter.
+ if records is None:
+ handle = TextIOWrapper(handle, encoding="utf-8", newline="")
+
+ # Get sample to determine delimiter
+ table_sample = handle.readline()
+
+ if handle.seekable():
+ handle.seek(0)
+ else:
+ table_sample_file = StringIO(table_sample)
+ handle = chain(table_sample_file, handle)
+
+ try:
+ # Note: this sort of duplicates _get_delimiter(), but it's easier if
+ # this is separate since it handles non-seekable buffers.
+ dialect = csv.Sniffer().sniff(table_sample, delimiters)
+ except csv.Error as error:
+ # This assumes all csv.Errors imply a delimiter issue. That might
+ # change in a future Python version.
+ raise InvalidDelimiter from error
+
+ metadata_reader = csv.DictReader(handle, dialect=dialect)
+
+ columns, records = metadata_reader.fieldnames, iter(metadata_reader)
+
if duplicate_reporting is DataErrorMethod.SILENT:
# Directly yield from metadata reader since we do not need to check for duplicate ids
- yield from metadata_reader
+ yield from records
else:
if id_column is None:
- id_column = metadata_reader.fieldnames[0]
+ id_column = columns[0]
- for record in metadata_reader:
+ for record in records:
record_id = record.get(id_column)
if record_id is None:
raise AugurError(f"The provided id column {id_column!r} does not exist in {table!r}.")
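Below is a condensed, standalone sketch of the detection flow in this hunk. It uses only the python_calamine calls that appear in the patch (load_workbook, CalamineError, get_sheet_by_index, to_python); the file name is hypothetical and duplicate handling is trimmed:

    import csv
    from io import TextIOWrapper
    from itertools import zip_longest
    import python_calamine as calamine

    with open("metadata.xlsx", "rb") as handle:   # assumed example path
        try:
            workbook = calamine.load_workbook(handle)
        except calamine.CalamineError:
            # Not Excel: rewind and fall back to sniffing a text delimiter.
            handle.seek(0)
            text = TextIOWrapper(handle, encoding="utf-8", newline="")
            dialect = csv.Sniffer().sniff(text.readline(), [",", "\t"])
            text.seek(0)
            records = iter(csv.DictReader(text, dialect=dialect))
        else:
            rows = workbook.get_sheet_by_index(0).to_python()
            columns = rows[0]
            # zip_longest pads rows shorter than the header with None, while
            # the row slice drops cells beyond the header width.
            records = (dict(zip_longest(columns, row[:len(columns)]))
                       for row in rows[1:])

        for record in records:
            print(record)

For example, dict(zip_longest(["a", "b"], ["1"])) yields {"a": "1", "b": None}, so ragged spreadsheet rows still produce complete records.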
@@ -284,10 +311,11 @@ def read_metadata_with_sequences(metadata, metadata_delimiters, fasta, seq_id_co
Parameters
----------
metadata: str
- Path to a CSV or TSV metadata file
+ Path to a CSV, TSV, or Excel metadata file or IO buffer
metadata_delimiters : list of str
List of possible delimiters to check for between columns in the metadata.
+ Ignored if *metadata* is an Excel file.
fasta: str
Path to a plain or gzipped FASTA file
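A hedged call sketch for the updated signature, assuming (as the name suggests) that it yields one combined record per matched sequence; the paths and column name are illustrative:

    from augur.io.metadata import read_metadata_with_sequences

    for record in read_metadata_with_sequences(
            "metadata.xlsx",                    # Excel now accepted alongside CSV/TSV
            metadata_delimiters=[",", "\t"],    # ignored for Excel input
            fasta="sequences.fasta.gz",
            seq_id_column="strain"):            # assumed example column name
        print(record)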
diff --git a/setup.py b/setup.py
index efc46cd0..770d4197 100644
--- a/setup.py
+++ b/setup.py
@@ -64,6 +64,7 @@ setuptools.setup(
"pandas >=1.0.0, ==1.*",
"phylo-treetime >=0.11.2, <0.12",
"pyfastx >=1.0.0, <3.0",
+ "python_calamine >=0.2.0",
"scipy ==1.*",
"xopen[zstd] >=1.7.0, ==1.*"
],