Skip to content

Commit 7dfaf3d

Browse files
Update docs
1 parent 4d9fc9c commit 7dfaf3d

File tree

1 file changed

+3
-177
lines changed

1 file changed

+3
-177
lines changed

README.md

Lines changed: 3 additions & 177 deletions
Original file line numberDiff line numberDiff line change
@@ -23,182 +23,8 @@ Each functionality in learningOrchestra is contained in its own class. Check the
2323

2424
# Example
2525

26-
Shown below is an example usage of learning-orchestra-client using the [Titanic Dataset](https://www.kaggle.com/c/titanic/overview):
26+
* [Here](examples/titanic.py) is an example using the [Titanic Dataset](https://www.kaggle.com/c/titanic/overview).
27+
* [Here](examples/sentiment_analysis.py) is an example using the [Sentiment Analysis on IMDb Reviews dataset](https://www.kaggle.com/avnika22/imdb-perform-sentiment-analysis-with-scikit-learn).
28+
* [Here](examples/mnist.py) is an example using the [MNIST Dataset](http://yann.lecun.com/exdb/mnist/).
2729

28-
```python
29-
from learning_orchestra_client import (
30-
dataset,
31-
builder,
32-
transform,
33-
)
3430

35-
cluster_ip = "34.95.187.26"
36-
37-
38-
dataset = Dataset(cluster_ip)
39-
40-
print(dataset.insert_dataset_sync(
41-
"titanic_training",
42-
"https://filebin.net/rpfdy8clm5984a4c/titanic_training.csv?t=gcnjz1yo"))
43-
print(dataset.insert_dataset_sync(
44-
"titanic_testing",
45-
"https://filebin.net/mguee52ke97k0x9h/titanic_testing.csv?t=ub4nc1rc"))
46-
47-
print(dataset.search_all_datasets())
48-
49-
50-
projection = Projection(cluster_ip)
51-
required_columns = [
52-
"PassengerId",
53-
"Pclass",
54-
"Age",
55-
"SibSp",
56-
"Parch",
57-
"Fare",
58-
"Name",
59-
"Sex",
60-
"Embarked",
61-
"Survived"
62-
]
63-
print(projection.insert_dataset_attributes_sync(
64-
"titanic_training",
65-
"titanic_training_projection",
66-
required_columns))
67-
68-
required_columns.remove("Survived")
69-
70-
print(projection.insert_dataset_attributes_sync(
71-
"titanic_testing",
72-
"titanic_testing_projection",
73-
required_columns))
74-
75-
76-
data_type_handler = DataType(cluster_ip)
77-
type_fields = {
78-
"Age": "number",
79-
"Fare": "number",
80-
"Parch": "number",
81-
"PassengerId": "number",
82-
"Pclass": "number",
83-
"SibSp": "number"
84-
}
85-
86-
print(data_type_handler.update_dataset_types(
87-
"titanic_testing_projection",
88-
type_fields))
89-
90-
type_fields["Survived"] = "number"
91-
92-
print(data_type_handler.update_dataset_types(
93-
"titanic_training_projection",
94-
type_fields))
95-
96-
97-
modeling_code = '''
98-
from pyspark.ml import Pipeline
99-
from pyspark.sql.functions import (
100-
mean, col, split,
101-
regexp_extract, when, lit)
102-
103-
from pyspark.ml.feature import (
104-
VectorAssembler,
105-
StringIndexer
106-
)
107-
108-
TRAINING_DF_INDEX = 0
109-
TESTING_DF_INDEX = 1
110-
111-
training_df = training_df.withColumnRenamed('Survived', 'label')
112-
testing_df = testing_df.withColumn('label', lit(0))
113-
datasets_list = [training_df, testing_df]
114-
115-
for index, dataset in enumerate(datasets_list):
116-
dataset = dataset.withColumn(
117-
"Initial",
118-
regexp_extract(col("Name"), "([A-Za-z]+)\.", 1))
119-
datasets_list[index] = dataset
120-
121-
misspelled_initials = [
122-
'Mlle', 'Mme', 'Ms', 'Dr',
123-
'Major', 'Lady', 'Countess',
124-
'Jonkheer', 'Col', 'Rev',
125-
'Capt', 'Sir', 'Don'
126-
]
127-
correct_initials = [
128-
'Miss', 'Miss', 'Miss', 'Mr',
129-
'Mr', 'Mrs', 'Mrs',
130-
'Other', 'Other', 'Other',
131-
'Mr', 'Mr', 'Mr'
132-
]
133-
for index, dataset in enumerate(datasets_list):
134-
dataset = dataset.replace(misspelled_initials, correct_initials)
135-
datasets_list[index] = dataset
136-
137-
138-
initials_age = {"Miss": 22,
139-
"Other": 46,
140-
"Master": 5,
141-
"Mr": 33,
142-
"Mrs": 36}
143-
for index, dataset in enumerate(datasets_list):
144-
for initial, initial_age in initials_age.items():
145-
dataset = dataset.withColumn(
146-
"Age",
147-
when((dataset["Initial"] == initial) &
148-
(dataset["Age"].isNull()), initial_age).otherwise(
149-
dataset["Age"]))
150-
datasets_list[index] = dataset
151-
152-
153-
for index, dataset in enumerate(datasets_list):
154-
dataset = dataset.na.fill({"Embarked": 'S'})
155-
datasets_list[index] = dataset
156-
157-
158-
for index, dataset in enumerate(datasets_list):
159-
dataset = dataset.withColumn("Family_Size", col('SibSp')+col('Parch'))
160-
dataset = dataset.withColumn('Alone', lit(0))
161-
dataset = dataset.withColumn(
162-
"Alone",
163-
when(dataset["Family_Size"] == 0, 1).otherwise(dataset["Alone"]))
164-
datasets_list[index] = dataset
165-
166-
167-
text_fields = ["Sex", "Embarked", "Initial"]
168-
for column in text_fields:
169-
for index, dataset in enumerate(datasets_list):
170-
dataset = StringIndexer(
171-
inputCol=column, outputCol=column+"_index").\
172-
fit(dataset).\
173-
transform(dataset)
174-
datasets_list[index] = dataset
175-
176-
177-
non_required_columns = ["Name", "Embarked", "Sex", "Initial"]
178-
for index, dataset in enumerate(datasets_list):
179-
dataset = dataset.drop(*non_required_columns)
180-
datasets_list[index] = dataset
181-
182-
183-
training_df = datasets_list[TRAINING_DF_INDEX]
184-
testing_df = datasets_list[TESTING_DF_INDEX]
185-
186-
assembler = VectorAssembler(
187-
inputCols=training_df.columns[:],
188-
outputCol="features")
189-
assembler.setHandleInvalid('skip')
190-
191-
features_training = assembler.transform(training_df)
192-
(features_training, features_evaluation) =\
193-
features_training.randomSplit([0.8, 0.2], seed=33)
194-
features_testing = assembler.transform(testing_df)
195-
'''
196-
197-
builder = Builder(cluster_ip)
198-
199-
print(builder.run_builder_sync(
200-
"titanic_training_projection",
201-
"titanic_testing_projection",
202-
modeling_code,
203-
["lr", "dt", "gb", "rf", "nb"]))
204-
```

0 commit comments

Comments
 (0)