@@ -23,182 +23,8 @@ Each functionality in learningOrchestra is contained in its own class. Check the
2323
2424# Example
2525
26- Shown below is an example usage of learning-orchestra-client using the [ Titanic Dataset] ( https://www.kaggle.com/c/titanic/overview ) :
26+ * [ Here] ( examples/titanic.py ) is an example using the [ Titanic Dataset] ( https://www.kaggle.com/c/titanic/overview )
27+ * [ Here] ( examples/sentiment_analysis.py ) is an example using the [ Sentiment Analysis On IMDb reviews dataset] ( https://www.kaggle.com/avnika22/imdb-perform-sentiment-analysis-with-scikit-learn )
28+ * [ Here] ( examples/mnist.py ) is an example using the [ MNIST Dataset] ( http://yann.lecun.com/exdb/mnist/ )
2729
28- ``` python
29- from learning_orchestra_client import (
30- dataset,
31- builder,
32- transform,
33- )
3430
35- cluster_ip = " 34.95.187.26"
36-
37-
38- dataset = Dataset(cluster_ip)
39-
40- print (dataset.insert_dataset_sync(
41- " titanic_training" ,
42- " https://filebin.net/rpfdy8clm5984a4c/titanic_training.csv?t=gcnjz1yo" ))
43- print (dataset.insert_dataset_sync(
44- " titanic_testing" ,
45- " https://filebin.net/mguee52ke97k0x9h/titanic_testing.csv?t=ub4nc1rc" ))
46-
47- print (dataset.search_all_datasets())
48-
49-
50- projection = Projection(cluster_ip)
51- required_columns = [
52- " PassengerId" ,
53- " Pclass" ,
54- " Age" ,
55- " SibSp" ,
56- " Parch" ,
57- " Fare" ,
58- " Name" ,
59- " Sex" ,
60- " Embarked" ,
61- " Survived"
62- ]
63- print (projection.insert_dataset_attributes_sync(
64- " titanic_training" ,
65- " titanic_training_projection" ,
66- required_columns))
67-
68- required_columns.remove(" Survived" )
69-
70- print (projection.insert_dataset_attributes_sync(
71- " titanic_testing" ,
72- " titanic_testing_projection" ,
73- required_columns))
74-
75-
76- data_type_handler = DataType(cluster_ip)
77- type_fields = {
78- " Age" : " number" ,
79- " Fare" : " number" ,
80- " Parch" : " number" ,
81- " PassengerId" : " number" ,
82- " Pclass" : " number" ,
83- " SibSp" : " number"
84- }
85-
86- print (data_type_handler.update_dataset_types(
87- " titanic_testing_projection" ,
88- type_fields))
89-
90- type_fields[" Survived" ] = " number"
91-
92- print (data_type_handler.update_dataset_types(
93- " titanic_training_projection" ,
94- type_fields))
95-
96-
97- modeling_code = '''
98- from pyspark.ml import Pipeline
99- from pyspark.sql.functions import (
100- mean, col, split,
101- regexp_extract, when, lit)
102-
103- from pyspark.ml.feature import (
104- VectorAssembler,
105- StringIndexer
106- )
107-
108- TRAINING_DF_INDEX = 0
109- TESTING_DF_INDEX = 1
110-
111- training_df = training_df.withColumnRenamed('Survived', 'label')
112- testing_df = testing_df.withColumn('label', lit(0))
113- datasets_list = [training_df, testing_df]
114-
115- for index, dataset in enumerate(datasets_list):
116- dataset = dataset.withColumn(
117- "Initial",
118- regexp_extract(col("Name"), "([A-Za-z]+)\.", 1))
119- datasets_list[index] = dataset
120-
121- misspelled_initials = [
122- 'Mlle', 'Mme', 'Ms', 'Dr',
123- 'Major', 'Lady', 'Countess',
124- 'Jonkheer', 'Col', 'Rev',
125- 'Capt', 'Sir', 'Don'
126- ]
127- correct_initials = [
128- 'Miss', 'Miss', 'Miss', 'Mr',
129- 'Mr', 'Mrs', 'Mrs',
130- 'Other', 'Other', 'Other',
131- 'Mr', 'Mr', 'Mr'
132- ]
133- for index, dataset in enumerate(datasets_list):
134- dataset = dataset.replace(misspelled_initials, correct_initials)
135- datasets_list[index] = dataset
136-
137-
138- initials_age = {"Miss": 22,
139- "Other": 46,
140- "Master": 5,
141- "Mr": 33,
142- "Mrs": 36}
143- for index, dataset in enumerate(datasets_list):
144- for initial, initial_age in initials_age.items():
145- dataset = dataset.withColumn(
146- "Age",
147- when((dataset["Initial"] == initial) &
148- (dataset["Age"].isNull()), initial_age).otherwise(
149- dataset["Age"]))
150- datasets_list[index] = dataset
151-
152-
153- for index, dataset in enumerate(datasets_list):
154- dataset = dataset.na.fill({"Embarked": 'S'})
155- datasets_list[index] = dataset
156-
157-
158- for index, dataset in enumerate(datasets_list):
159- dataset = dataset.withColumn("Family_Size", col('SibSp')+col('Parch'))
160- dataset = dataset.withColumn('Alone', lit(0))
161- dataset = dataset.withColumn(
162- "Alone",
163- when(dataset["Family_Size"] == 0, 1).otherwise(dataset["Alone"]))
164- datasets_list[index] = dataset
165-
166-
167- text_fields = ["Sex", "Embarked", "Initial"]
168- for column in text_fields:
169- for index, dataset in enumerate(datasets_list):
170- dataset = StringIndexer(
171- inputCol=column, outputCol=column+"_index").\
172- fit(dataset).\
173- transform(dataset)
174- datasets_list[index] = dataset
175-
176-
177- non_required_columns = ["Name", "Embarked", "Sex", "Initial"]
178- for index, dataset in enumerate(datasets_list):
179- dataset = dataset.drop(*non_required_columns)
180- datasets_list[index] = dataset
181-
182-
183- training_df = datasets_list[TRAINING_DF_INDEX]
184- testing_df = datasets_list[TESTING_DF_INDEX]
185-
186- assembler = VectorAssembler(
187- inputCols=training_df.columns[:],
188- outputCol="features")
189- assembler.setHandleInvalid('skip')
190-
191- features_training = assembler.transform(training_df)
192- (features_training, features_evaluation) =\
193- features_training.randomSplit([0.8, 0.2], seed=33)
194- features_testing = assembler.transform(testing_df)
195- '''
196-
197- builder = Builder(cluster_ip)
198-
199- print (builder.run_builder_sync(
200- " titanic_training_projection" ,
201- " titanic_testing_projection" ,
202- modeling_code,
203- [" lr" , " dt" , " gb" , " rf" , " nb" ]))
204- ```
0 commit comments