diff --git a/augur/io/json.py b/augur/io/json.py
index 2a4678ea..5eae5d1b 100644
--- a/augur/io/json.py
+++ b/augur/io/json.py
@@ -32,7 +32,7 @@ The LICENSE file included in ID3C's repo is copied below verbatim::
     SOFTWARE.
 """
 import json
-from datetime import datetime
+from datetime import date, datetime
 from typing import Iterable
 from uuid import UUID
@@ -96,10 +96,11 @@ class JsonEncoder(json.JSONEncoder):
         """
         Returns *value* as JSON or raises a TypeError.
         Serializes:
+        * :class:`~datetime.date` using :meth:`~datetime.date.isoformat()`
         * :class:`~datetime.datetime` using :meth:`~datetime.datetime.isoformat()`
         * :class:`~uuid.UUID` using ``str()``
         """
-        if isinstance(value, datetime):
+        if isinstance(value, (date, datetime)):
             return value.isoformat()
         elif isinstance(value, UUID):
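
For illustration only (not part of the patch): a minimal, self-contained sketch of what this json.py change does, using a stand-in encoder with the same logic and made-up sample data.

import json
from datetime import date, datetime
from uuid import UUID

# Stand-in for the patched augur.io.json.JsonEncoder: date and datetime both
# serialize via isoformat(), UUID via str(); anything else falls back to the
# base class, which raises TypeError.
class JsonEncoder(json.JSONEncoder):
    def default(self, value):
        if isinstance(value, (date, datetime)):
            return value.isoformat()
        elif isinstance(value, UUID):
            return str(value)
        else:
            return super().default(value)

# Made-up record: the bare date would raise TypeError before this patch.
record = {
    "id": UUID(int=0),
    "collected": date(2024, 7, 12),
    "processed": datetime(2024, 7, 12, 6, 28),
}
print(json.dumps(record, cls=JsonEncoder))
# {"id": "00000000-0000-0000-0000-000000000000", "collected": "2024-07-12", "processed": "2024-07-12T06:28:00"}
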
diff --git a/augur/io/metadata.py b/augur/io/metadata.py
index bcaef5b7..b318abfc 100644
--- a/augur/io/metadata.py
+++ b/augur/io/metadata.py
@@ -3,9 +3,10 @@ import os
 from typing import Iterable, Sequence
 import pandas as pd
 import pyfastx
+import python_calamine as calamine
 import sys
-from io import StringIO
-from itertools import chain
+from io import StringIO, TextIOWrapper
+from itertools import chain, zip_longest
 from augur.errors import AugurError
 from augur.io.print import print_err
@@ -169,11 +170,12 @@ def read_table_to_dict(table, delimiters, duplicate_reporting=DataErrorMethod.ER
     Parameters
     ----------
     table: str
-        Path to a CSV or TSV file or IO buffer
+        Path to a CSV, TSV, or Excel file or IO buffer
     delimiters : list of str
         List of possible delimiters to check for between columns in the metadata.
         Only one delimiter will be inferred.
+        Ignored if *table* is an Excel file.
     duplicate_reporting: DataErrorMethod, optional
         How should duplicate records be reported
@@ -197,34 +199,59 @@ def read_table_to_dict(table, delimiters, duplicate_reporting=DataErrorMethod.ER
     """
     seen_ids = set()
     duplicate_ids = set()
-    with open_file(table) as handle:
-        # Get sample to determine delimiter
-        table_sample = handle.readline()
+    with open_file(table, "rb") as handle:
+        columns = None
+        records = None
+        # Try binary handle as Excel, as long as it's seekable so we can reset
+        # to the start on failure.
         if handle.seekable():
-            handle.seek(0)
-        else:
-            table_sample_file = StringIO(table_sample)
-            handle = chain(table_sample_file, handle)
-
-        try:
-            # Note: this sort of duplicates _get_delimiter(), but it's easier if
-            # this is separate since it handles non-seekable buffers.
-            dialect = csv.Sniffer().sniff(table_sample, delimiters)
-        except csv.Error as error:
-            # This assumes all csv.Errors imply a delimiter issue. That might
-            # change in a future Python version.
-            raise InvalidDelimiter from error
-
-        metadata_reader = csv.DictReader(handle, dialect=dialect)
+            try:
+                workbook = calamine.load_workbook(handle)
+            except calamine.CalamineError:
+                handle.seek(0)
+            else:
+                rows = workbook.get_sheet_by_index(0).to_python()
+                columns = rows[0]
+                records = (
+                    dict(zip_longest(columns, row[:len(columns)]))
+                    for row
+                    in rows[1:])
+
+        # Not Excel, so convert handle to text and sniff the delimiter.
+        if records is None:
+            handle = TextIOWrapper(handle, encoding="utf-8", newline="")
+
+            # Get sample to determine delimiter
+            table_sample = handle.readline()
+
+            if handle.seekable():
+                handle.seek(0)
+            else:
+                table_sample_file = StringIO(table_sample)
+                handle = chain(table_sample_file, handle)
+
+            try:
+                # Note: this sort of duplicates _get_delimiter(), but it's easier if
+                # this is separate since it handles non-seekable buffers.
+                dialect = csv.Sniffer().sniff(table_sample, delimiters)
+            except csv.Error as error:
+                # This assumes all csv.Errors imply a delimiter issue. That might
+                # change in a future Python version.
+                raise InvalidDelimiter from error
+
+            metadata_reader = csv.DictReader(handle, dialect=dialect)
+
+            columns, records = metadata_reader.fieldnames, iter(metadata_reader)
+
         if duplicate_reporting is DataErrorMethod.SILENT:
             # Directly yield from metadata reader since we do not need to check for duplicate ids
-            yield from metadata_reader
+            yield from records
         else:
             if id_column is None:
-                id_column = metadata_reader.fieldnames[0]
+                id_column = columns[0]
-            for record in metadata_reader:
+            for record in records:
                 record_id = record.get(id_column)
                 if record_id is None:
                     raise AugurError(f"The provided id column {id_column!r} does not exist in {table!r}.")
@@ -284,10 +311,11 @@ def read_metadata_with_sequences(metadata, metadata_delimiters, fasta, seq_id_co
     Parameters
     ----------
     metadata: str
-        Path to a CSV or TSV metadata file
+        Path to a CSV, TSV, or Excel metadata file or IO buffer
     metadata_delimiters : list of str
         List of possible delimiters to check for between columns in the metadata.
+        Ignored if *metadata* is an Excel file.
     fasta: str
         Path to a plain or gzipped FASTA file
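
For readers unfamiliar with python_calamine, here is a rough standalone sketch of the Excel branch above: load a workbook from a binary handle, take the first sheet's rows via to_python(), treat row 0 as the header, and pad short rows with zip_longest. The helper name and file name are made up for illustration; the error handling mirrors the CalamineError fallback in the diff.

from itertools import zip_longest
import python_calamine as calamine

def excel_records(path):
    """Return a list of dicts for the first sheet's data rows, or None if the
    file is not something calamine can open (the caller would then fall back
    to the CSV/TSV code path, as the patched read_table_to_dict does)."""
    with open(path, "rb") as handle:
        try:
            workbook = calamine.load_workbook(handle)
        except calamine.CalamineError:
            return None
        rows = workbook.get_sheet_by_index(0).to_python()

    columns = rows[0]
    # zip_longest pads rows shorter than the header with None, while
    # row[:len(columns)] drops trailing cells beyond the header width.
    return [dict(zip_longest(columns, row[:len(columns)])) for row in rows[1:]]

# Hypothetical usage (file name is made up):
# records = excel_records("metadata.xlsx")
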
diff --git a/setup.py b/setup.py
index efc46cd0..770d4197 100644
--- a/setup.py
+++ b/setup.py
@@ -64,6 +64,7 @@ setuptools.setup(
         "pandas >=1.0.0, ==1.*",
         "phylo-treetime >=0.11.2, <0.12",
         "pyfastx >=1.0.0, <3.0",
+        "python_calamine >=0.2.0",
         "scipy ==1.*",
         "xopen[zstd] >=1.7.0, ==1.*"
     ],
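
Assuming the patch above is applied and the new python_calamine dependency is installed, existing call sites need no changes to accept Excel input; a hypothetical invocation (the file name and column name are made up):

from augur.io.metadata import read_table_to_dict

# The same call that handles CSV/TSV; with this patch it should also accept an
# Excel workbook, in which case the delimiter list is ignored.
for record in read_table_to_dict("metadata.xlsx", delimiters=["\t", ","]):
    print(record.get("strain"))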