Skip to content

Commit e5267d6

Browse files
committed
convert test_get_sparse_dataset_rowid_and_ignore_and_target
1 parent 261849b commit e5267d6

1 file changed

Lines changed: 50 additions & 28 deletions

File tree

tests/test_datasets/test_dataset.py

Lines changed: 50 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -348,34 +348,6 @@ def setUp(self):
348348

349349
self.sparse_dataset = openml.datasets.get_dataset(4136, download_data=False)
350350

351-
352-
def test_get_sparse_dataset_dataframe(self):
353-
rval, *_ = self.sparse_dataset.get_data()
354-
assert isinstance(rval, pd.DataFrame)
355-
np.testing.assert_array_equal(
356-
[pd.SparseDtype(np.float32, fill_value=0.0)] * len(rval.dtypes),
357-
rval.dtypes,
358-
)
359-
assert rval.shape == (600, 20001)
360-
361-
def test_get_sparse_dataset_rowid_and_ignore_and_target(self):
362-
# TODO: re-add row_id and ignore attributes
363-
self.sparse_dataset.ignore_attribute = ["V256"]
364-
self.sparse_dataset.row_id_attribute = ["V512"]
365-
X, y, categorical, _ = self.sparse_dataset.get_data(
366-
target="class",
367-
include_row_id=False,
368-
include_ignore_attribute=False,
369-
)
370-
assert all(dtype == pd.SparseDtype(np.float32, fill_value=0.0) for dtype in X.dtypes)
371-
# array format returned dense, but now we only return sparse and let the user handle it.
372-
assert isinstance(y.dtypes, pd.SparseDtype)
373-
assert X.shape == (600, 19998)
374-
375-
assert len(categorical) == 19998
376-
self.assertListEqual(categorical, [False] * 19998)
377-
assert y.shape == (600,)
378-
379351
def test_get_sparse_categorical_data_id_395(self):
380352
dataset = openml.datasets.get_dataset(395, download_data=True)
381353
# breakpoint()
@@ -387,8 +359,58 @@ def test_get_sparse_categorical_data_id_395(self):
387359
assert feature.data_type == "nominal"
388360
assert len(feature.nominal_values) == 25
389361

362+
@pytest.mark.production
363+
def test_get_sparse_dataset_rowid_and_ignore_and_target(requests_mock, test_files_directory):
364+
content_file = (
365+
test_files_directory / "mock_responses" / "datasets" / "sparse_dataset" /"data_description.xml"
366+
)
367+
requests_mock.get("https://www.openml.org/api/v1/xml/data/4136", text=content_file.read_text())
368+
sparse_dataset = openml.datasets.get_dataset(4136, download_data=False)
369+
370+
# TODO: re-add row_id and ignore attributes
371+
sparse_dataset.ignore_attribute = ["V2"]
372+
sparse_dataset.row_id_attribute = ["V5"]
373+
374+
sparse_arff_file = (
375+
test_files_directory / "mock_responses" / "datasets" / "sparse_dataset" /"sparse_arff.arff"
376+
)
377+
requests_mock.get("https://api.openml.org/data/v1/download/1681111/Dexter.sparse_arff", text = sparse_arff_file.read_text())
378+
379+
X, y, categorical, _ = sparse_dataset.get_data(
380+
target="class",
381+
include_row_id=False,
382+
include_ignore_attribute=False,
383+
)
384+
assert all(dtype == pd.SparseDtype(np.float32, fill_value=0.0) for dtype in X.dtypes)
385+
# array format returned dense, but now we only return sparse and let the user handle it.
386+
assert isinstance(y.dtypes, pd.SparseDtype)
387+
assert X.shape == (10, 8)
390388

389+
assert len(categorical) == 8
390+
assert categorical == [False] * 8
391+
assert y.shape == (10,)
391392

393+
@pytest.mark.production
394+
def test_get_sparse_dataset_dataframe(requests_mock, test_files_directory):
395+
content_file = (
396+
test_files_directory / "mock_responses" / "datasets" / "sparse_dataset" /"data_description.xml"
397+
)
398+
requests_mock.get("https://www.openml.org/api/v1/xml/data/4136", text=content_file.read_text())
399+
sparse_dataset = openml.datasets.get_dataset(4136, download_data=False)
400+
401+
sparse_arff_file = (
402+
test_files_directory / "mock_responses" / "datasets" / "sparse_dataset" /"sparse_arff.arff"
403+
)
404+
requests_mock.get("https://api.openml.org/data/v1/download/1681111/Dexter.sparse_arff", text = sparse_arff_file.read_text())
405+
rval, *_ = sparse_dataset.get_data()
406+
407+
assert isinstance(rval, pd.DataFrame)
408+
np.testing.assert_array_equal(
409+
[pd.SparseDtype(np.float32, fill_value=0.0)] * len(rval.dtypes),
410+
rval.dtypes,
411+
)
412+
assert rval.shape == (10, 11)
413+
392414
@pytest.mark.production
393415
def test_get_sparse_dataset_dataframe_with_target(requests_mock, test_files_directory):
394416

0 commit comments

Comments
 (0)