J'obtiens la ValueError ci-dessous lorsque je soumets mon Pipeline à un correcteur automatique (grader), et je ne vois pas où je suis censé retirer 12 500 lignes de données.

ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,2].shape[0] == 13892, expected 1544.

J'ai été chargé de créer un modèle qui combine les caractéristiques commerciales des maisons de soins infirmiers avec les résultats de leur enquête (survey) du cycle 1, ainsi que le temps écoulé entre les enquêtes du cycle 1 et du cycle 2, afin de prédire le score total du cycle 2.

Voici le code que j'utilise pour accomplir la tâche ci-dessus.

# Custom transformer meant to compute the time difference between the
# cycle 1 and cycle 2 survey dates (version from the question).
class TimedeltaTransformer(BaseEstimator, TransformerMixin):
    """Emit the gap between two date columns, in seconds, as one feature column.

    NOTE(review): in this version the date columns are parsed and stored in
    ``fit``, and ``transform`` never reads its ``X`` argument. The output
    therefore always has as many rows as the data passed to ``fit``,
    whatever is later passed to ``transform``/``predict``. That is
    consistent with the reported ValueError (block 2 of the FeatureUnion has
    13892 rows where 1544 were expected; presumably 13892 is the training
    row count).
    """

    def __init__(self, t1_col, t2_col):
        # Labels of the two date columns; the feature is t1_col - t2_col.
        self.t1_col = t1_col
        self.t2_col = t2_col

    def fit(self, X, y=None):
        """Parse both date columns of ``X`` and keep them on the instance."""
        # Non-DataFrame input (e.g. a list of dicts) is wrapped first.
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        # NOTE(review): these attributes hold the fit-time data itself, not
        # learned parameters; transform() depends on them instead of on X.
        self.col_1 = X[self.t1_col].apply(pd.to_datetime)
        self.col_2 = X[self.t2_col].apply(pd.to_datetime)
        return self

    def transform(self, X):
        """Return the stored date differences in seconds, shaped (n, 1).

        NOTE(review): ``X`` is ignored; ``n`` is the fit-time row count.
        """
        difference_list = []
        # Element-wise difference of the columns cached by fit().
        difference = self.col_1 - self.col_2
        for obj in difference:
            difference_list.append(obj.total_seconds())
        # Single-column 2-D array so it can be stacked with other features.
        return np.array(difference_list).reshape(-1,1)

# Instantiate the time-gap transformer on the two survey-date columns
cycle_1_date, cycle_2_date = 'CYCLE_1_SURVEY_DATE', 'CYCLE_2_SURVEY_DATE'
time_feature = TimedeltaTransformer(t1_col=cycle_1_date, t2_col=cycle_2_date)

# Cycle 1 survey results, extracted with the custom column selector
cycle_1_cols = [
    'CYCLE_1_DEFS',
    'CYCLE_1_NFROMDEFS',
    'CYCLE_1_NFROMCOMP',
    'CYCLE_1_DEFS_SCORE',
    'CYCLE_1_NUMREVIS',
    'CYCLE_1_REVISIT_SCORE',
    'CYCLE_1_TOTAL_SCORE',
]
cycle_1_features = Pipeline(steps=[
    ('cst2', ColumnSelectTransformer(columns=cycle_1_cols)),
])

# survey_model is a two-step Pipeline:
#   1. 'features' - a FeatureUnion that transforms and concatenates the
#      business features, the cycle 1 features and the time feature;
#   2. 'forest'   - a RandomForestRegressor fitted on the combined features.
survey_model = Pipeline(steps=[
    (
        'features',
        FeatureUnion(transformer_list=[
            ('business', business_features),
            ('survey', cycle_1_features),
            ('time', time_feature),
        ]),
    ),
    ('forest', RandomForestRegressor()),
])

# Fit the whole pipeline on the training data; the target is cast to int.
# (As reported by the author, this call produces no error.)
survey_model.fit(data, cycle_2_score.astype(int))

# Hand the bound predict method to the grader, which calls it on its own
# test cases. (As reported by the author, this call raises the ValueError.)
grader.score.ml__survey_model(survey_model.predict)

Le pipeline ajusté (après l'appel à fit) ressemble à ceci :

Pipeline(memory=None,
         steps=[('features',
                 FeatureUnion(n_jobs=None,
                              transformer_list=[('business',
                                                 FeatureUnion(n_jobs=None,
                                                              transformer_list=[('simple',
                                                                                 Pipeline(memory=None,
                                                                                          steps=[('cst',
                                                                                                  ColumnSelectTransformer(columns=['BEDCERT',
                                                                                                                                   'RESTOT',
                                                                                                                                   'INHOSP',
                                                                                                                                   'CCRC_FACIL',
                                                                                                                                   'SFF',
                                                                                                                                   'CHOW_LAST_12MOS',
                                                                                                                                   'SPRINKLER_STATUS',
                                                                                                                                   'EXP_TOTAL',
                                                                                                                                   'ADJ_TOTAL'])),
                                                                                                 ('imputer',
                                                                                                  SimpleImpute...
                              transformer_weights=None, verbose=False)),
                ('forest',
                 RandomForestRegressor(bootstrap=True, criterion='mse',
                                       max_depth=None, max_features='auto',
                                       max_leaf_nodes=None,
                                       min_impurity_decrease=0.0,
                                       min_impurity_split=None,
                                       min_samples_leaf=1, min_samples_split=2,
                                       min_weight_fraction_leaf=0.0,
                                       n_estimators=10, n_jobs=None,
                                       oob_score=False, random_state=None,
                                       verbose=0, warm_start=False))],
         verbose=False)

Contexte supplémentaire : je construis ce modèle pour que sa méthode predict soit passée à un correcteur automatique (grader) personnalisé dans le cadre d'un projet. Le correcteur passe une liste de dictionnaires à la méthode predict ou predict_proba de mon estimateur, et non un DataFrame. Le modèle doit donc fonctionner avec les deux types de données. C'est pourquoi je dois fournir un ColumnSelectTransformer personnalisé, à utiliser à la place du ColumnTransformer de scikit-learn.

Vous trouverez ci-dessous le code supplémentaire lié aux caractéristiques commerciales (business_features) et au ColumnSelectTransformer.

# Custom transformer that picks a fixed list of columns and hands their
# values back as an array; non-DataFrame input is wrapped in a DataFrame
class ColumnSelectTransformer(BaseEstimator, TransformerMixin):
    """Select ``columns`` from tabular input and return their values."""

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return the configured columns of ``X`` as an array."""
        frame = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)
        selected = frame[self.columns]
        return selected.values

# The simple_cols columns, with missing values imputed by the column mean
simple_features = Pipeline(steps=[
    ('cst', ColumnSelectTransformer(columns=simple_cols)),
    ('imputer', SimpleImputer(strategy='mean')),
])

# OWNERSHIP: select, impute with the most frequent value, then one-hot encode
owner_onehot = Pipeline(steps=[
    ('cst', ColumnSelectTransformer(columns=['OWNERSHIP'])),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder()),
])

# CERTIFICATION: same three steps as for OWNERSHIP
cert_onehot = Pipeline(steps=[
    ('cst', ColumnSelectTransformer(columns=['CERTIFICATION'])),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder()),
])

# Both one-hot encoded categorical columns, concatenated
categorical_features = FeatureUnion(transformer_list=[
    ('owner_onehot', owner_onehot),
    ('cert_onehot', cert_onehot),
])

# All business features: the imputed simple columns plus the categorical ones
business_features = FeatureUnion(transformer_list=[
    ('simple', simple_features),
    ('categorical', categorical_features),
])

Enfin, voici l'erreur complète qui est levée :

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-165-790ca6139493> in <module>()
----> 1 grader.score.ml__survey_model(survey_model.predict)

/opt/conda/lib/python3.7/site-packages/static_grader/grader.py in func(*args, **kw)
     92   def __getattr__(self, method):
     93     def func(*args, **kw):
---> 94       return self(method, *args, **kw)
     95     return func
     96 

/opt/conda/lib/python3.7/site-packages/static_grader/grader.py in __call__(self, question_name, func)
     88       return
     89     test_cases = json.loads(resp.text)
---> 90     test_cases_grading(question_name, func, test_cases)
     91 
     92   def __getattr__(self, method):

/opt/conda/lib/python3.7/site-packages/static_grader/grader.py in test_cases_grading(question_name, func, test_cases)
     40   for test_case in test_cases:
     41     if inspect.isroutine(func):
---> 42       sub_res = func(*test_case['args'], **test_case['kwargs'])
     43     elif not test_case['args'] and not test_case['kwargs']:
     44       sub_res = func

/opt/conda/lib/python3.7/site-packages/sklearn/utils/metaestimators.py in <lambda>(*args, **kwargs)
    114 
    115         # lambda, but not partial, allows help() to work with update_wrapper
--> 116         out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
    117         # update the docstring of the returned function
    118         update_wrapper(out, self.fn)

/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in predict(self, X, **predict_params)
    419         Xt = X
    420         for _, name, transform in self._iter(with_final=False):
--> 421             Xt = transform.transform(Xt)
    422         return self.steps[-1][-1].predict(Xt, **predict_params)
    423 

/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in transform(self, X)
    963             return np.zeros((X.shape[0], 0))
    964         if any(sparse.issparse(f) for f in Xs):
--> 965             Xs = sparse.hstack(Xs).tocsr()
    966         else:
    967             Xs = np.hstack(Xs)

/opt/conda/lib/python3.7/site-packages/scipy/sparse/construct.py in hstack(blocks, format, dtype)
    463 
    464     """
--> 465     return bmat([blocks], format=format, dtype=dtype)
    466 
    467 

/opt/conda/lib/python3.7/site-packages/scipy/sparse/construct.py in bmat(blocks, format, dtype)
    584                                                     exp=brow_lengths[i],
    585                                                     got=A.shape[0]))
--> 586                     raise ValueError(msg)
    587 
    588                 if bcol_lengths[j] == 0:

ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,2].shape[0] == 13892, expected 1544.
0
Dumb chimp 11 oct. 2019 à 11:41

1 réponse

Meilleure réponse

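Corriger mon TimedeltaTransformer a résolu le problème : transform calcule désormais la différence à partir du X qu'il reçoit, au lieu de réutiliser les colonnes mémorisées lors de fit.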


class TimedeltaTransformer(BaseEstimator, TransformerMixin):
    """Turn two date columns into a single "elapsed seconds" feature.

    The transformer is stateless: ``transform`` works only from the ``X`` it
    is given, so the output has exactly one row per input row.

    Parameters
    ----------
    t1_col, t2_col
        Labels of the two date columns. The feature is ``t1_col - t2_col``.
    """

    def __init__(self, t1_col, t2_col):
        self.t1_col = t1_col
        self.t2_col = t2_col

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``t1_col - t2_col`` in seconds as an array of shape (n, 1).

        ``X`` may be a DataFrame or anything ``pd.DataFrame`` accepts (for
        example a list of dicts). Values are parsed with ``pd.to_datetime``.
        """
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        elapsed = pd.to_datetime(X[self.t1_col]) - pd.to_datetime(X[self.t2_col])
        # Vectorized conversion instead of a Python-level loop over each
        # Timedelta; reshaped to one column so it can be stacked with the
        # other feature blocks.
        return elapsed.dt.total_seconds().values.reshape(-1, 1)


0
Dumb chimp 18 oct. 2019 à 18:45