@@ -348,34 +348,6 @@ def setUp(self):
348348
349349 self .sparse_dataset = openml .datasets .get_dataset (4136 , download_data = False )
350350
351-
def test_get_sparse_dataset_dataframe(self):
    """Check that the sparse dataset is returned as an all-sparse float32 DataFrame."""
    frame, *_ = self.sparse_dataset.get_data()
    assert isinstance(frame, pd.DataFrame)

    # Every column is expected to share one sparse float32 dtype (fill value 0.0).
    sparse_float32 = pd.SparseDtype(np.float32, fill_value=0.0)
    expected_dtypes = [sparse_float32] * len(frame.dtypes)
    np.testing.assert_array_equal(expected_dtypes, frame.dtypes)

    assert frame.shape == (600, 20001)
360-
def test_get_sparse_dataset_rowid_and_ignore_and_target(self):
    """Check get_data() with a target while row-id and ignore columns are excluded."""
    # TODO: re-add row_id and ignore attributes
    dataset = self.sparse_dataset
    dataset.ignore_attribute = ["V256"]
    dataset.row_id_attribute = ["V512"]

    features, target, is_categorical, _ = dataset.get_data(
        target="class",
        include_row_id=False,
        include_ignore_attribute=False,
    )

    sparse_float32 = pd.SparseDtype(np.float32, fill_value=0.0)
    assert all(column_dtype == sparse_float32 for column_dtype in features.dtypes)
    # array format returned dense, but now we only return sparse and let the user handle it.
    assert isinstance(target.dtypes, pd.SparseDtype)
    assert features.shape == (600, 19998)

    # One categorical flag per remaining feature column, all of them False.
    n_features = 19998
    assert len(is_categorical) == n_features
    self.assertListEqual(is_categorical, [False] * n_features)
    assert target.shape == (600,)
378-
379351 def test_get_sparse_categorical_data_id_395 (self ):
380352 dataset = openml .datasets .get_dataset (395 , download_data = True )
381353 # breakpoint()
@@ -387,8 +359,58 @@ def test_get_sparse_categorical_data_id_395(self):
387359 assert feature .data_type == "nominal"
388360 assert len (feature .nominal_values ) == 25
389361
@pytest.mark.production
def test_get_sparse_dataset_rowid_and_ignore_and_target(requests_mock, test_files_directory):
    """Check get_data() on the mocked sparse dataset with a target column.

    The dataset description and the sparse ARFF payload are both served from
    local fixture files through ``requests_mock``. One column is marked as
    ignored and one as row id, and both are excluded from the returned features.
    """
    fixture_dir = test_files_directory / "mock_responses" / "datasets" / "sparse_dataset"

    # Serve the dataset description from the local fixture file.
    requests_mock.get(
        "https://www.openml.org/api/v1/xml/data/4136",
        text=(fixture_dir / "data_description.xml").read_text(),
    )
    sparse_dataset = openml.datasets.get_dataset(4136, download_data=False)

    # TODO: re-add row_id and ignore attributes
    sparse_dataset.ignore_attribute = ["V2"]
    sparse_dataset.row_id_attribute = ["V5"]

    # Register the data file mock before get_data() is called.
    requests_mock.get(
        "https://api.openml.org/data/v1/download/1681111/Dexter.sparse_arff",
        text=(fixture_dir / "sparse_arff.arff").read_text(),
    )

    features, target, is_categorical, _ = sparse_dataset.get_data(
        target="class",
        include_row_id=False,
        include_ignore_attribute=False,
    )

    sparse_float32 = pd.SparseDtype(np.float32, fill_value=0.0)
    assert all(column_dtype == sparse_float32 for column_dtype in features.dtypes)
    # array format returned dense, but now we only return sparse and let the user handle it.
    assert isinstance(target.dtypes, pd.SparseDtype)
    assert features.shape == (10, 8)

    # One categorical flag per remaining feature column, all of them False.
    n_features = 8
    assert len(is_categorical) == n_features
    assert is_categorical == [False] * n_features
    assert target.shape == (10,)
391392
@pytest.mark.production
def test_get_sparse_dataset_dataframe(requests_mock, test_files_directory):
    """Check that the mocked sparse dataset loads as an all-sparse float32 DataFrame.

    The dataset description and the sparse ARFF payload are both served from
    local fixture files through ``requests_mock``.
    """
    fixture_dir = test_files_directory / "mock_responses" / "datasets" / "sparse_dataset"

    # Serve the dataset description from the local fixture file.
    requests_mock.get(
        "https://www.openml.org/api/v1/xml/data/4136",
        text=(fixture_dir / "data_description.xml").read_text(),
    )
    sparse_dataset = openml.datasets.get_dataset(4136, download_data=False)

    # Register the data file mock before get_data() is called.
    requests_mock.get(
        "https://api.openml.org/data/v1/download/1681111/Dexter.sparse_arff",
        text=(fixture_dir / "sparse_arff.arff").read_text(),
    )
    frame, *_ = sparse_dataset.get_data()

    assert isinstance(frame, pd.DataFrame)
    # Every column is expected to share one sparse float32 dtype (fill value 0.0).
    sparse_float32 = pd.SparseDtype(np.float32, fill_value=0.0)
    expected_dtypes = [sparse_float32] * len(frame.dtypes)
    np.testing.assert_array_equal(expected_dtypes, frame.dtypes)
    assert frame.shape == (10, 11)
413+
392414@pytest .mark .production
393415def test_get_sparse_dataset_dataframe_with_target (requests_mock , test_files_directory ):
394416
0 commit comments