英文:
sklearn transformer for outlier removal - returning xy?
问题
以下是您要翻译的代码部分:
from sklearn.datasets import make_classification
X1, y1 = make_classification(n_samples=100, n_features=10, n_informative=5, n_classes=3)
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import IsolationForest
import numpy as np
class IsolationForestOutlierRemover(BaseEstimator, TransformerMixin):
    """Drop the rows that an IsolationForest labels as outliers.

    The inlier mask is computed in ``fit`` on the data passed there, and
    ``transform`` re-applies that stored mask. When ``y`` is supplied,
    ``transform`` returns an ``(X, y)`` tuple instead of a single array.
    """

    def __init__(self, contamination=0.05):
        self.contamination = contamination
        self.isolation_forest = IsolationForest(contamination=contamination)

    def fit(self, X, y=None):
        """Fit the forest on ``X`` and store the boolean inlier mask."""
        forest = self.isolation_forest
        forest.fit(X)
        # Rows whose prediction equals 1 are the ones that are kept.
        self.mask = forest.predict(X) == 1
        return self

    def transform(self, X, y=None):
        """Return the masked ``X``, or the masked ``(X, y)`` pair if ``y`` is given."""
        keep = self.mask
        if y is None:
            return X[keep]
        return X[keep], y[keep]

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and filter that same data in one call."""
        return self.fit(X, y).transform(X, y)
# Called directly with y, fit_transform returns an (X, y) tuple of the kept rows.
working = IsolationForestOutlierRemover().fit_transform(X1, y1)
working[0].shape
# 95 rows kept, as reported by the asker (X1 has 10 features, so the shape is (95, 10))
working
# %%
# NOTE(review): Pipeline and RandomForestClassifier are not imported in the
# visible snippet; presumably they are imported elsewhere in the asker's file.
pipelinet = Pipeline(
    [
        ("outlier_removal", IsolationForestOutlierRemover(contamination=0.05)),
        ("random_forest", RandomForestClassifier()),
    ]
)
# This is the call the asker reports as raising the ValueError.
notworking = pipelinet.fit(X1, y1)
notworking
请注意,代码中的一些 HTML 编码符号(例如 &quot;)可能需要进行修复,以确保代码的正确性。
英文:
I am trying to remove rows that are labeled outliers. I have this partially working, but not in the context of a pipeline and I am not sure why.
from sklearn.datasets import make_classification
X1, y1 = make_classification(n_samples=100, n_features=10, n_informative=5, n_classes=3)
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import IsolationForest
import numpy as np
class IsolationForestOutlierRemover(BaseEstimator, TransformerMixin):
    """Drop the rows that an IsolationForest labels as outliers.

    The inlier mask is computed in ``fit`` on the data passed there, and
    ``transform`` re-applies that stored mask. When ``y`` is supplied,
    ``transform`` returns an ``(X, y)`` tuple instead of a single array.
    """

    def __init__(self, contamination=0.05):
        self.contamination = contamination
        self.isolation_forest = IsolationForest(contamination=contamination)

    def fit(self, X, y=None):
        """Fit the forest on ``X`` and store the boolean inlier mask."""
        forest = self.isolation_forest
        forest.fit(X)
        # Rows whose prediction equals 1 are the ones that are kept.
        self.mask = forest.predict(X) == 1
        return self

    def transform(self, X, y=None):
        """Return the masked ``X``, or the masked ``(X, y)`` pair if ``y`` is given."""
        keep = self.mask
        if y is None:
            return X[keep]
        return X[keep], y[keep]

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and filter that same data in one call."""
        return self.fit(X, y).transform(X, y)
# Called directly with y, fit_transform returns an (X, y) tuple of the kept rows.
working = IsolationForestOutlierRemover().fit_transform(X1, y1)
working[0].shape
# 95 rows kept, as reported by the asker (X1 has 10 features, so the shape is (95, 10))
working
# %%
# NOTE(review): Pipeline and RandomForestClassifier are not imported in the
# visible snippet; presumably they are imported elsewhere in the asker's file.
pipelinet = Pipeline(
    [
        ("outlier_removal", IsolationForestOutlierRemover(contamination=0.05)),
        ("random_forest", RandomForestClassifier()),
    ]
)
# This is the call the asker reports as raising the ValueError.
notworking = pipelinet.fit(X1, y1)
notworking
Getting the following error:
ValueError Traceback (most recent call last)
/home/mmann1123/Documents/github/YM_TZ_crop_classifier/4_model.py in line 10
349 # %%
351 pipelinet = Pipeline(
352 [
353 ("outlier_removal", IsolationForestOutlierRemover(contamination=0.05)),
354 ("random_forest", RandomForestClassifier()),
355 ]
356 )
---> 358 notworking = pipelinet.fit(X1, y1)
359 notworking
File ~/miniconda3/envs/crop_class/lib/python3.8/site-packages/sklearn/pipeline.py:406, in Pipeline.fit(self, X, y, **fit_params)
404 if self._final_estimator != "passthrough":
405 fit_params_last_step = fit_params_steps[self.steps[-1][0]]
--> 406 self._final_estimator.fit(Xt, y, **fit_params_last_step)
408 return self
File ~/miniconda3/envs/crop_class/lib/python3.8/site-packages/sklearn/ensemble/_forest.py:346, in BaseForest.fit(self, X, y, sample_weight)
344 if issparse(y):
345 raise ValueError("sparse multilabel-indicator for y is not supported.")
--> 346 X, y = self._validate_data(
347 X, y, multi_output=True, accept_sparse="csc", dtype=DTYPE
348 )
...
--> 185 array = numpy.asarray(array, order=order, dtype=dtype)
186 return xp.asarray(array, copy=copy)
187 else:
ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 2 dimensions. The detected shape was (2, 95) + inhomogeneous part.
答案1
得分: 1
RandomForestClassifier 的 fit 方法需要两个数组 X 和 y。在异常值移除后,经过转换的 X 和 y 需要传递给管道中的下一步操作,但是你当前在 IsolationForestOutlierRemover 类中的 transform 方法在 y 不为 None 时返回一个单一元组,这是导致问题的原因。
要修复这个问题,你需要更新管道以正确地将 X 和 y 传递给 RandomForestClassifier。有几种方法可以做到这一点;我使用了覆盖的方式。
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
class IsolationForestOutlierRemover(BaseEstimator, TransformerMixin):
    """Drop the rows that an IsolationForest labels as outliers.

    Parameters
    ----------
    contamination : float, default=0.05
        Passed through to ``IsolationForest``.

    Notes
    -----
    The inlier mask is computed in ``fit`` and re-applied by ``transform``,
    so ``transform`` is only meaningful for the data that was given to
    ``fit``. When ``y`` is supplied the result is an ``(X, y)`` tuple, which
    a scikit-learn ``Pipeline`` cannot consume as an intermediate step: the
    pipeline treats whatever a step returns as the new ``X`` and forwards the
    original ``y`` untouched. Use this class *before* the pipeline instead.
    """

    def __init__(self, contamination=0.05):
        self.contamination = contamination
        self.isolation_forest = IsolationForest(contamination=self.contamination)

    def fit(self, X, y=None):
        """Fit the forest on ``X`` and store the boolean inlier mask; return ``self``."""
        # Keep the wrapped forest in sync if contamination was changed
        # through set_params() after construction.
        self.isolation_forest.set_params(contamination=self.contamination)
        self.isolation_forest.fit(X)
        # Rows whose prediction equals 1 are the ones that are kept.
        self.mask = self.isolation_forest.predict(X) == 1
        return self

    def transform(self, X, y=None):
        """Return the masked ``X``, or the masked ``(X, y)`` pair if ``y`` is given."""
        if y is None:
            return X[self.mask]
        return X[self.mask], y[self.mask]

    def fit_transform(self, X, y=None, **fit_params):
        """Fit on ``X`` and filter that same data in one call."""
        return self.fit(X, y, **fit_params).transform(X, y)


# Remove the outliers before the pipeline. Placing the remover inside the
# Pipeline hands the (X, y) tuple to the classifier as if it were X, which
# reproduces the "inhomogeneous shape" ValueError from the question.
X_inliers, y_inliers = IsolationForestOutlierRemover(
    contamination=0.05
).fit_transform(X1, y1)

pipeline = Pipeline(
    [
        ("random_forest", RandomForestClassifier()),
    ]
)
pipeline.fit(X_inliers, y_inliers)
需要注意的一点是:在 sklearn 的 Pipeline 对象的 fit 调用中,中间步骤使用的是 fit_transform;fit() 仅在最终的估算器上被调用。
英文:
I don't have your specific package versions, and I am not using conda, but I was able to replicate your problem and fix it.
`RandomForestClassifier` expects two arrays, X and y, for the `fit` method. After the outlier removal, the transformed X and y need to be passed to the next step in the pipeline, but your current `transform` method in the `IsolationForestOutlierRemover` class returns a single tuple when y is not None, which is causing the issue.
To fix this, you need to update the `Pipeline` to correctly pass the X and y to the `RandomForestClassifier`. There are a couple of ways to do this; I did it with overriding.
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
class IsolationForestOutlierRemover(BaseEstimator, TransformerMixin):
    """Drop the rows that an IsolationForest labels as outliers.

    Parameters
    ----------
    contamination : float, default=0.05
        Passed through to ``IsolationForest``.

    Notes
    -----
    The inlier mask is computed in ``fit`` and re-applied by ``transform``,
    so ``transform`` is only meaningful for the data that was given to
    ``fit``. When ``y`` is supplied the result is an ``(X, y)`` tuple, which
    a scikit-learn ``Pipeline`` cannot consume as an intermediate step: the
    pipeline treats whatever a step returns as the new ``X`` and forwards the
    original ``y`` untouched. Use this class *before* the pipeline instead.
    """

    def __init__(self, contamination=0.05):
        self.contamination = contamination
        self.isolation_forest = IsolationForest(contamination=self.contamination)

    def fit(self, X, y=None):
        """Fit the forest on ``X`` and store the boolean inlier mask; return ``self``."""
        # Keep the wrapped forest in sync if contamination was changed
        # through set_params() after construction.
        self.isolation_forest.set_params(contamination=self.contamination)
        self.isolation_forest.fit(X)
        # Rows whose prediction equals 1 are the ones that are kept.
        self.mask = self.isolation_forest.predict(X) == 1
        return self

    def transform(self, X, y=None):
        """Return the masked ``X``, or the masked ``(X, y)`` pair if ``y`` is given."""
        if y is None:
            return X[self.mask]
        return X[self.mask], y[self.mask]

    def fit_transform(self, X, y=None, **fit_params):
        """Fit on ``X`` and filter that same data in one call."""
        return self.fit(X, y, **fit_params).transform(X, y)


# Remove the outliers before the pipeline. Placing the remover inside the
# Pipeline hands the (X, y) tuple to the classifier as if it were X, which
# reproduces the "inhomogeneous shape" ValueError from the question.
X_inliers, y_inliers = IsolationForestOutlierRemover(
    contamination=0.05
).fit_transform(X1, y1)

pipeline = Pipeline(
    [
        ("random_forest", RandomForestClassifier()),
    ]
)
pipeline.fit(X_inliers, y_inliers)
One thing to note: `fit_transform` is used for the intermediate steps during the `fit` call of the `Pipeline` object from sklearn. `fit()` is only called for the final estimator.
答案2
得分: 0
错误是因为 transform 方法中输入和输出数组的形状不匹配。你应该返回 None 和 X。以下是修改后的代码:
def transform(self, X, y=None):
    """Apply the stored inlier mask and always return an ``(X, y)`` pair.

    ``self.mask`` indexes ``X`` (and ``y`` when it is provided); the second
    element of the returned pair is ``None`` when ``y`` is ``None``.
    """
    keep = self.mask
    kept_X = X[keep]
    kept_y = None if y is None else y[keep]
    return kept_X, kept_y
请注意,这是你提供的代码的中文翻译。如果有其他需要,请告诉我。
英文:
The error you encountered is due to the shape mismatch between the input and output arrays in the transform method. You should return None along with X.
Here is the modified code.
def transform(self, X, y=None):
    """Apply the stored inlier mask and always return an ``(X, y)`` pair.

    ``self.mask`` indexes ``X`` (and ``y`` when it is provided); the second
    element of the returned pair is ``None`` when ``y`` is ``None``.
    """
    keep = self.mask
    kept_X = X[keep]
    kept_y = None if y is None else y[keep]
    return kept_X, kept_y
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论