Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
3feba11
reformat script
danlu1 Feb 9, 2026
1c68dac
reorganize code to ensure row columns remain int
danlu1 Feb 10, 2026
4a29a16
add unit test for convert_dtypes_to_json_serializable
danlu1 Feb 10, 2026
3ecb6ec
correct unit for datetime64
danlu1 Feb 10, 2026
af989c0
remove the unwanted code
danlu1 Feb 10, 2026
4d06d3a
revert changes in test_csv_to_pandas_df_with_date_columns
danlu1 Feb 10, 2026
e1b20dc
update docstrings
danlu1 Feb 11, 2026
7ef7110
add integration test for store_rows
danlu1 Feb 12, 2026
a4913a6
add to_csv kwargs to ensure double quote and apostrophe formatted corre…
danlu1 Feb 16, 2026
98689d3
remove json string dumps function to let synapse decode data directly
danlu1 Feb 16, 2026
a0af1b6
update unit test since the convert_dtypes_to_json_serializable no lon…
danlu1 Feb 17, 2026
5002bd6
update integration test as no json string need to be generated
danlu1 Feb 18, 2026
c874fe4
remove unwanted code
danlu1 Feb 24, 2026
dab80f0
simplify test cases
danlu1 Feb 24, 2026
8644201
merge develop branch changes
danlu1 Mar 9, 2026
3412534
add to_csv_kwargs to store_rows function for pandas dataframe
danlu1 Mar 9, 2026
7db7c85
add default to_csv_kwargs for store_row_async
danlu1 Mar 9, 2026
8a75043
set escapechar default value in store_rows_async
danlu1 Mar 9, 2026
d00b30b
add notes to ensure escapechar is set correctly if using custom to_cs…
danlu1 Mar 9, 2026
957b03b
Merge remote-tracking branch 'origin/develop' into SYNPY-1749-allow-q…
BryanFauble May 6, 2026
4fda36a
[SYNPY-1749] Fix mutable default and override-not-merge for to_csv_kw…
BryanFauble May 6, 2026
fbc50ec
[SYNPY-1749] Address PR review feedback on convert_dtypes_to_json_ser…
BryanFauble May 6, 2026
82ac756
[SYNPY-1749] Address Copilot PR review feedback
BryanFauble May 6, 2026
488fffc
Merge branch 'develop' into SYNPY-1749-allow-quote-apostrophe-in-stor…
danlu1 May 7, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions synapseclient/core/upload/multipart_upload_async.py
Original file line number Diff line number Diff line change
Expand Up @@ -698,6 +698,8 @@ async def multipart_upload_dataframe_async(
force_restart: True to restart a previously initiated upload from scratch, False
to try to resume.
storage_str: Optional string to append to the upload message.
to_csv_kwargs: Additional arguments to pass to the `pd.DataFrame.to_csv`
function when writing the data to a CSV file.
"""
trace.get_current_span().set_attributes(
{
Expand Down
87 changes: 77 additions & 10 deletions synapseclient/models/mixins/table_components.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from io import BytesIO
from typing import Any, Dict, List, Optional, Protocol, Tuple, Union

import pandas as pd
from tqdm import tqdm
from tqdm.contrib.logging import logging_redirect_tqdm
from typing_extensions import Self
Expand Down Expand Up @@ -138,10 +139,20 @@ def row_labels_from_rows(rows: List[Row]) -> List[Row]:
)


def convert_dtypes_to_json_serializable(df):
def convert_dtypes_to_json_serializable(df) -> pd.DataFrame:
"""
Convert the dtypes of the int64 and float64 columns to object columns which are JSON serializable types.
Also, convert the ROW_ID, ROW_VERSION, and ROW_ID.1 columns to int columns which are JSON serializable types.
Prepare a DataFrame for JSON/CSV serialization by cleaning special values
and normalizing dtypes. Mutates the passed-in DataFrame in place (and also
returns it).

- Recursively replaces `Ellipsis` with `"..."` and `pd.NA`/`np.nan`/`None`
with `None` inside nested `list`/`dict` values.
- Converts top-level `Ellipsis` to `"..."` and top-level `pd.NA`/`np.nan`/
`None` to `None`.
- Runs `convert_dtypes()` then casts every column to `object` dtype (with
`pd.NA` -> `None`), except `ROW_ID`, `ROW_VERSION`, and `ROW_ID.1`, which
are cast back to `int` since the Synapse API requires them as integers.

Arguments:
Comment thread
BryanFauble marked this conversation as resolved.
df: The dataframe to convert the dtypes of.
Returns:
Expand All @@ -163,16 +174,64 @@ def convert_dtypes_to_json_serializable(df):
"datetime_list_col": [[datetime(2021, 1, 1), datetime(2021, 1, 2), datetime(2021, 1, 3)], [datetime(2021, 1, 4), datetime(2021, 1, 5), datetime(2021, 1, 6)], None, [datetime(2021, 1, 7), datetime(2021, 1, 8), datetime(2021, 1, 9)]],
"entityid_list_col": [["syn123", "syn456", None], ["syn101", "syn102", "syn103"], None, ["syn104", "syn105", "syn106"]],
"userid_list_col": [["user1", "user2", "user3"], ["user4", "user5", None], None, ["user7", "user8", "user9"]],
"json_col_with_quotes": [
{
"id": 1,
"description": 'Text with "quotes" in the description field',
"references": []
},
{
"id": 2,
"description": 'Another description with "quoted text" here',
"references": ["ref1", "ref2"]
},
{
"id": 3,
"description": 'Description containing "multiple" quoted "words"',
"references": [...]
},
{
"id": 4,
"description": 'Description containing apostrophes sage\'s',
"references": [...]
}

],
}).convert_dtypes()
Comment thread
BryanFauble marked this conversation as resolved.
df = convert_dtypes_to_json_serializable(df)
print(df)
"""
import pandas as pd

def _serialize_json_value(x):
if isinstance(x, (list, dict)):

def _reformat_special_values(obj):
if obj is ...:
return "..."
if isinstance(obj, dict):
return {k: _reformat_special_values(v) for k, v in obj.items()}
if isinstance(obj, list):
return [_reformat_special_values(item) for item in obj]
# Catch pd.NA, np.nan, and None — none are valid JSON
if pd.isna(obj):
return None
return obj

return _reformat_special_values(x)
# Handle standalone ellipsis
if x is ...:
return "..."
# Handle top-level pd.NA, np.nan, None
if pd.isna(x):
return None
return x

for col in df.columns:
df[col] = (
df[col].replace({pd.NA: None}).astype(object)
) # this will convert the int64 and float64 columns to object columns
df[col] = df[col].apply(_serialize_json_value)
# restore the original values of the column especially for the int64 and float64 columns since apply function changes the dtype
df[col] = df[col].convert_dtypes()
df[col] = df[col].replace({pd.NA: None}).astype(object)

# Convert ROW_ prefixed columns back to int (like ROW_ID, ROW_VERSION)
if col in [
"ROW_ID",
Expand Down Expand Up @@ -2809,7 +2868,6 @@ async def main():
timeout=timeout,
synapse_client=synapse_client,
)

if download_location:
return csv_path

Expand Down Expand Up @@ -3387,7 +3445,9 @@ async def store_rows_async(
function when writing the data to a CSV file. This is only used when
the `values` argument is a Pandas DataFrame. See
<https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html>
for complete list of supported arguments.
for complete list of supported arguments. Any kwargs you supply are
merged on top of the default `{"escapechar": "\\"}`, so you only need
to override `escapechar` explicitly if you want different behavior.

job_timeout: The maximum amount of time to wait for a job to complete.
This is used when inserting, and updating rows of data. Each individual
Expand Down Expand Up @@ -3560,6 +3620,8 @@ async def main():
test_import_pandas()
from pandas import DataFrame

to_csv_kwargs = {"escapechar": DEFAULT_ESCAPSE_CHAR, **(to_csv_kwargs or {})}

original_values = values
if isinstance(values, dict):
values = DataFrame(values).convert_dtypes()
Expand Down Expand Up @@ -3786,6 +3848,7 @@ async def _stream_and_update_from_df(
"AppendableRowSetRequest",
]
] = None,
to_csv_kwargs: Optional[Dict[str, Any]] = None,
) -> None:
"""
Organize the process of reading in and uploading parts of the DataFrame we are
Expand Down Expand Up @@ -3816,6 +3879,8 @@ async def _stream_and_update_from_df(
being uploaded.
changes: Additional changes to the table that should
execute within this transaction.
to_csv_kwargs: Additional arguments to pass to the `pd.DataFrame.to_csv`
function when writing the data to a CSV file.
"""
file_handle_id = await multipart_upload_dataframe_async(
syn=client,
Expand All @@ -3828,6 +3893,7 @@ async def _stream_and_update_from_df(
line_start=line_start,
line_end=line_end,
bytes_to_prepend=header,
to_csv_kwargs=to_csv_kwargs,
)
# We are using a semaphore here because large tables can take a very long time
# for the update to complete. This will allow us to wait for the update to
Expand Down Expand Up @@ -4031,8 +4097,8 @@ async def _chunk_and_upload_df(
to_csv_kwargs: Additional arguments to pass to the `pd.DataFrame.to_csv`
function when writing the data to a CSV file.
"""
df = convert_dtypes_to_json_serializable(df)
Comment thread
danlu1 marked this conversation as resolved.
# Loop over the rows of the DF to determine the size/boundries we'll be uploading

chunks_to_upload = []
size_of_chunk = 0
buffer = BytesIO()
Expand Down Expand Up @@ -4142,6 +4208,7 @@ async def _chunk_and_upload_df(
header=header_line,
changes=changes,
file_suffix=f"{part}",
to_csv_kwargs=to_csv_kwargs,
)
)
)
Expand Down
Loading
Loading