sklearn 输入包含 NaN、无穷大或超出 dtype('float64') 范围的值。

huangapple go评论93阅读模式
英文:

sklearn Input contains NaN, infinity or a value too large for dtype('float64')

问题

import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.linear_model import LogisticRegression

movdata = pd.read_csv('Movie_collection.csv')
movdata

movdata1 = pd.get_dummies(movdata)
movdata1

x = movdata1.loc[:, movdata1.columns != 'Start_Tech_Oscar']
y = movdata1['Start_Tech_Oscar']
x.head()

y.head()

clsmov = LogisticRegression()
clsmov.fit(x, y)
英文:

code:

import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.linear_model import LogisticRegression
movdata = pd.read_csv('Movie_collection.csv')
movdata
movdata

Collection Marketing expense Production expense Multiplex coverage Budget Movie_length Lead_ Actor_Rating Lead_Actress_rating Director_rating Producer_rating Critic_rating Trailer_views 3D_available Time_taken Twitter_hastags Genre Avg_age_actors MPAA_film_rating Num_multiplex Start_Tech_Oscar
0 48000 20.1264 59.62 0.462 36524.125 138.7 7.825 8.095 7.910 7.995 7.94 527367 YES 109.60 223.840 Thriller 23 PG 494 0
1 43200 20.5462 69.14 0.531 35668.655 152.4 7.505 7.650 7.440 7.470 7.44 494055 NO 146.64 243.456 Drama 42 PG 462 1
2 69400 20.5458 69.14 0.531 39912.675 134.6 7.485 7.570 7.495 7.515 7.44 547051 NO 147.88 2022.400 Comedy 38 PG 458 0
3 66800 20.6474 59.36 0.542 38873.890 119.3 6.895 7.035 6.920 7.020 8.26 516279 YES 185.36 225.344 Drama 45 PG 472 0
4 72400 21.3810 59.36 0.542 39701.585 127.7 6.920 7.070 6.815 7.070 8.26 531448 NO 176.48 225.792 Drama 55 PG 395 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
501 44800 21.2526 78.86 0.427 36624.115 142.6 8.680 8.775 8.620 8.970 6.80 492480 NO 186.96 243.584 Action 27 PG 561 1
502 41200 20.9054 78.86 0.427 33996.600 150.2 8.780 8.945 8.770 8.930 7.80 482875 YES 132.24 263.296 Action 20 PG 600 1
503 47800 21.2152 78.86 0.427 38751.680 164.5 8.830 8.970 8.855 9.010 7.80 532239 NO 109.56 243.824 Comedy 31 PG 576 1
504 44000 22.1918 78.86 0.427 37740.670 162.8 8.730 8.845 8.800 8.845 6.80 496077 YES 158.80 303.520 Comedy 47 PG 607 1
505 38000 20.9482 78.86 0.427 33496.650 154.3 8.640 8.880 8.680 8.790 6.80 518438 YES 205.60 203.040 Comedy 45 PG 604 1
506 rows × 20 columns

movdata1 = pd.get_dummies(movdata)
movdata1

Collection Marketing expense Production expense Multiplex coverage Budget Movie_length Lead_ Actor_Rating Lead_Actress_rating Director_rating Producer_rating ... Avg_age_actors Num_multiplex Start_Tech_Oscar 3D_available_NO 3D_available_YES Genre_Action Genre_Comedy Genre_Drama Genre_Thriller MPAA_film_rating_PG
0 48000 20.1264 59.62 0.462 36524.125 138.7 7.825 8.095 7.910 7.995 ... 23 494 0 0 1 0 0 0 1 1
1 43200 20.5462 69.14 0.531 35668.655 152.4 7.505 7.650 7.440 7.470 ... 42 462 1 1 0 0 0 1 0 1
2 69400 20.5458 69.14 0.531 39912.675 134.6 7.485 7.570 7.495 7.515 ... 38 458 0 1 0 0 1 0 0 1
3 66800 20.6474 59.36 0.542 38873.890 119.3 6.895 7.035 6.920 7.020 ... 45 472 0 0 1 0 0 1 0 1
4 72400 21.3810 59.36 0.542 39701.585 127.7 6.920 7.070 6.815 7.070 ... 55 395 0 1 0 0 0 1 0 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
501 44800 21.2526 78.86 0.427 36624.115 142.6 8.680 8.775 8.620 8.970 ... 27 561 1 1 0 1 0 0 0 1
502 41200 20.9054 78.86 0.427 33996.600 150.2 8.780 8.945 8.770 8.930 ... 20 600 1 0 1 1 0 0 0 1
503 47800 21.2152 78.86 0.427 38751.680 164.5 8.830 8.970 8.855 9.010 ... 31 576 1 1 0 0 1 0 0 1
504 44000 22.1918 78.86 0.427 37740.670 162.8 8.730 8.845 8.800 8.845 ... 47 607 1 0 1 0 1 0 0 1
505 38000 20.9482 78.86 0.427 33496.650 154.3 8.640 8.880 8.680 8.790 ... 45 604 1 0 1 0 1 0 0 1
506 rows × 24 columns

x= movdata1.loc[:,movdata1.columns!='Start_Tech_Oscar']
y = movdata1['Start_Tech_Oscar']
x.head()

Collection Marketing expense Production expense Multiplex coverage Budget Movie_length Lead_ Actor_Rating Lead_Actress_rating Director_rating Producer_rating ... Twitter_hastags Avg_age_actors Num_multiplex 3D_available_NO 3D_available_YES Genre_Action Genre_Comedy Genre_Drama Genre_Thriller MPAA_film_rating_PG
0 48000 20.1264 59.62 0.462 36524.125 138.7 7.825 8.095 7.910 7.995 ... 223.840 23 494 0 1 0 0 0 1 1
1 43200 20.5462 69.14 0.531 35668.655 152.4 7.505 7.650 7.440 7.470 ... 243.456 42 462 1 0 0 0 1 0 1
2 69400 20.5458 69.14 0.531 39912.675 134.6 7.485 7.570 7.495 7.515 ... 2022.400 38 458 1 0 0 1 0 0 1
3 66800 20.6474 59.36 0.542 38873.890 119.3 6.895 7.035 6.920 7.020 ... 225.344 45 472 0 1 0 0 1 0 1
4 72400 21.3810 59.36 0.542 39701.585 127.7 6.920 7.070 6.815 7.070 ... 225.792 55 395 1 0 0 0 1 0 1
5 rows × 23 columns

y.head()

0 0
1 1
2 0
3 0
4 0
Name: Start_Tech_Oscar, dtype: int64
clsmov = LogisticRegression()
clsmov.fit(x,y)

ValueError Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_6736\2033513903.py in <module>
----> 1 clsmov.fit(x,y)

~\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py in fit(self, X, y, sample_weight)
1506 _dtype = [np.float64, np.float32]
1507
-> 1508 X, y = self._validate_data(
1509 X,
1510 y,

~\anaconda3\lib\site-packages\sklearn\base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
579 y = check_array(y, **check_y_params)
580 else:
--> 581 X, y = check_X_y(X, y, **check_params)
582 out = X, y
583

~\anaconda3\lib\site-packages\sklearn\utils\validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
962 raise ValueError("y cannot be None")
963
--> 964 X = check_array(
965 X,
966 accept_sparse=accept_sparse,

~\anaconda3\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
798
799 if force_all_finite:
--> 800 _assert_all_finite(array, allow_nan=force_all_finite == "allow-nan")
801
802 if ensure_min_samples > 0:

~\anaconda3\lib\site-packages\sklearn\utils\validation.py in _assert_all_finite(X, allow_nan, msg_dtype)
112 ):
113 type_err = "infinity" if allow_nan else "NaN, infinity"
--> 114 raise ValueError(
115 msg_err.format(
116 type_err, msg_dtype if msg_dtype is not None else X.dtype

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
``

答案1

得分: 0

以下是翻译好的部分:

Your DataFrame most likely contains null (NaN) or infinite values.

Check for NaN values with this command (where `mat` is your feature matrix, here `x`):

使用以下命令来检查(对于nan)

np.any(np.isnan(mat))

And use this command to check for infinite values (it returns `False` if any value is NaN or infinite):

以及这个命令用于无穷大数据

np.all(np.isfinite(mat))

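You can then clean your data in some way to get rid of (or replace) the missing or infinite values, for example by dropping the affected rows with `movdata.dropna()` or filling them with `movdata.fillna(...)` before splitting into `x` and `y`.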

英文:

Your DataFrame most likely contains null (NaN) or infinite values.

Check for NaN values with this command (where `mat` is your feature matrix, here `x`):

np.any(np.isnan(mat))

And use this command to check for infinite values (it returns `False` if any value is NaN or infinite):

np.all(np.isfinite(mat))

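You can then clean your data in some way to get rid of (or replace) the missing or infinite values, for example by dropping the affected rows with `movdata.dropna()` or filling them with `movdata.fillna(...)` before splitting into `x` and `y`.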

huangapple
  • 本文由 发表于 2023年3月7日 02:01:46
  • 转载请务必保留本文链接:https://go.coder-hub.com/75654287.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定