英文:
How can I show the distribution of the columns of a dataset, ordering the images in a specific way?
问题
我正在尝试显示数据集的列的分布,其中我有以下变量:['precip', 'pressureChange', 'pressureMeanSeaLevel', 'relativeHumidity', 'snow', 'temperature', 'temperatureDewPoint', 'temperatureFeelsLike', 'uvIndex', 'visibility', 'windDirection', 'windSpeed']
,并且每个变量都对应若干项测量(统计量),每项测量都是数据集的一列:
{
'precip': ['meanPrecip', 'minPrecip', 'maxPrecip'],
'pressureChange': ['meanPressurechange', 'minPressurechange', 'maxPressurechange'],
'pressureMeanSeaLevel': ['meanPressuremeansealevel', 'minPressuremeansealevel', 'maxPressuremeansealevel'],
'relativeHumidity': ['meanRelativehumidity', 'minRelativehumidity', 'maxRelativehumidity'],
'snow': ['meanSnow', 'minSnow', 'maxSnow'],
'temperature': ['meanTemperature', 'minTemperature', 'maxTemperature'],
'temperatureDewPoint': ['meanTemperatureDewPoint', 'minTemperatureDewPoint', 'maxTemperatureDewPoint'],
'temperatureFeelsLike': ['meanTemperaturefeelslike', 'minTemperaturefeelslike', 'maxTemperaturefeelslike'],
'uvIndex': ['modeUvindex', 'maxUvindex', 'minUvindex'],
'visibility': ['meanVisibility', 'minVisibility', 'maxVisibility'],
'windDirection': ['meanWinddirection'],
'windSpeed': ['meanWindspeed', 'minWindspeed', 'maxWindspeed']
}
我想要绘制它们,其中每行图形代表一个变量,每列图形代表一项测量。
我尝试了以下代码:
# Attempt from the question: histogram every measure column of every variable.
# NOTE(review): the indentation of this snippet was lost, so which statements
# belong to the loop body below has to be inferred.
# Variables to plot, in the order they are visited.
variable_types = ['precip', 'pressureChange', 'pressureMeanSeaLevel', 'relativeHumidity', 'snow', 'temperature', 'temperatureDewPoint', 'temperatureFeelsLike', 'uvIndex', 'visibility', 'windDirection', 'windSpeed']
# Maps each variable to the dataset columns that hold its measures.
variable_columns = {
'precip': ['meanPrecip', 'minPrecip', 'maxPrecip'],
'pressureChange': ['meanPressurechange', 'minPressurechange', 'maxPressurechange'],
'pressureMeanSeaLevel': ['meanPressuremeansealevel', 'minPressuremeansealevel', 'maxPressuremeansealevel'],
'relativeHumidity': ['meanRelativehumidity', 'minRelativehumidity', 'maxRelativehumidity'],
'snow': ['meanSnow', 'minSnow', 'maxSnow'],
'temperature': ['meanTemperature', 'minTemperature', 'maxTemperature'],
'temperatureDewPoint': ['meanTemperatureDewPoint', 'minTemperatureDewPoint', 'maxTemperatureDewPoint'],
'temperatureFeelsLike': ['meanTemperaturefeelslike', 'minTemperaturefeelslike', 'maxTemperaturefeelslike'],
'uvIndex': ['modeUvindex', 'maxUvindex', 'minUvindex'],
'visibility': ['meanVisibility', 'minVisibility', 'maxVisibility'],
'windDirection': ['meanWinddirection'],
'windSpeed': ['meanWindspeed', 'minWindspeed', 'maxWindspeed']
}
for i, var_type in enumerate(variable_types):
columns = variable_columns[var_type]
print(columns)
# NOTE(review): this call depends on `columns`, so it presumably runs once per
# variable and builds a fresh len(variable_types) x len(columns) grid each time
# instead of one shared grid - confirm once the indentation is restored.
fig, axes = plt.subplots(nrows=len(variable_types), ncols=len(columns), figsize=(5, 5))
# Flat view of the Axes grid so a subplot can be picked with a single index.
axes = axes.flat
# Keep only this variable's measure columns.
data = METEO_COMPLETO[columns]
# None of the column names above contains '_', so the sort key is the whole
# name and this is a plain alphabetical sort.
ordered_columns = sorted(columns, key=lambda x: x.split('_')[0])
print(ordered_columns)
# NOTE(review): a DataFrame with up to three columns is drawn into the single
# Axes axes[i]; this is probably why the result is not the intended
# variable-by-measure grid - confirm against the pandas DataFrame.hist docs.
data[ordered_columns].hist(ax=axes[i], bins=20, alpha=0.7, edgecolor='black', color='skyblue')
axes[i].set_title(f'Distribution of {var_type}')
axes[i].set_xlabel('Value')
axes[i].set_ylabel('Frequency')
axes[i].legend(ordered_columns)
plt.tight_layout()
plt.show()
但我得到了以下结果(最后4个变量):
另一个可能的选择是在同一图上显示可用的测量,使用不同的颜色来查看每个变量的最小值、均值和最大值的分布。
英文:
I am trying to show the distribution of the columns of a dataset where I have these variables: ['precip', 'pressureChange', 'pressureMeanSeaLevel', 'relativeHumidity', 'snow', 'temperature', 'temperatureDewPoint', 'temperatureFeelsLike', 'uvIndex', 'visibility', 'windDirection', 'windSpeed']
and for each one there are several measures; each of these measures is a column of the dataset:
{
'precip': ['meanPrecip', 'minPrecip', 'maxPrecip'],
'pressureChange': ['meanPressurechange', 'minPressurechange', 'maxPressurechange'],
'pressureMeanSeaLevel': ['meanPressuremeansealevel', 'minPressuremeansealevel', 'maxPressuremeansealevel'],
'relativeHumidity': ['meanRelativehumidity', 'minRelativehumidity', 'maxRelativehumidity'],
'snow': ['meanSnow', 'minSnow', 'maxSnow'],
'temperature': ['meanTemperature', 'minTemperature', 'maxTemperature'],
'temperatureDewPoint': ['meanTemperaturedewpoint', 'minTemperaturedewpoint', 'maxTemperaturedewpoint'],
'temperatureFeelsLike': ['meanTemperaturefeelslike', 'minTemperaturefeelslike', 'maxTemperaturefeelslike'],
'uvIndex': ['modeUvindex', 'maxUvindex', 'minUvindex'],
'visibility': ['meanVisibility', 'minVisibility', 'maxVisibility'],
'windDirection': ['meanWinddirection'],
'windSpeed': ['meanWindspeed', 'minWindspeed', 'maxWindspeed']
}
I want to plot them where each row of graphs is a variable and each column of graphs is a measurement.
I tried that:
# Attempt from the question: histogram every measure column of every variable.
# NOTE(review): the indentation of this snippet was lost, so which statements
# belong to the loop body below has to be inferred.
# Variables to plot, in the order they are visited.
variable_types = ['precip', 'pressureChange', 'pressureMeanSeaLevel', 'relativeHumidity', 'snow', 'temperature', 'temperatureDewPoint', 'temperatureFeelsLike', 'uvIndex', 'visibility', 'windDirection', 'windSpeed']
# Maps each variable to the dataset columns that hold its measures.
variable_columns = {
'precip': ['meanPrecip', 'minPrecip', 'maxPrecip'],
'pressureChange': ['meanPressurechange', 'minPressurechange', 'maxPressurechange'],
'pressureMeanSeaLevel': ['meanPressuremeansealevel', 'minPressuremeansealevel', 'maxPressuremeansealevel'],
'relativeHumidity': ['meanRelativehumidity', 'minRelativehumidity', 'maxRelativehumidity'],
'snow': ['meanSnow', 'minSnow', 'maxSnow'],
'temperature': ['meanTemperature', 'minTemperature', 'maxTemperature'],
'temperatureDewPoint': ['meanTemperaturedewpoint', 'minTemperaturedewpoint', 'maxTemperaturedewpoint'],
'temperatureFeelsLike': ['meanTemperaturefeelslike', 'minTemperaturefeelslike', 'maxTemperaturefeelslike'],
'uvIndex': ['modeUvindex', 'maxUvindex', 'minUvindex'],
'visibility': ['meanVisibility', 'minVisibility', 'maxVisibility'],
'windDirection': ['meanWinddirection'],
'windSpeed': ['meanWindspeed', 'minWindspeed', 'maxWindspeed']
}
for i, var_type in enumerate(variable_types):
columns = variable_columns[var_type]
print(columns)
# NOTE(review): this call depends on `columns`, so it presumably runs once per
# variable and builds a fresh len(variable_types) x len(columns) grid each time
# instead of one shared grid - confirm once the indentation is restored.
fig, axes = plt.subplots(nrows=len(variable_types), ncols=len(columns), figsize=(5, 5))
# Flat view of the Axes grid so a subplot can be picked with a single index.
axes = axes.flat
# Keep only this variable's measure columns.
data = METEO_COMPLETO[columns]
# None of the column names above contains '_', so the sort key is the whole
# name and this is a plain alphabetical sort.
ordered_columns = sorted(columns, key=lambda x: x.split('_')[0])
print(ordered_columns)
# NOTE(review): a DataFrame with up to three columns is drawn into the single
# Axes axes[i]; this is probably why the result is not the intended
# variable-by-measure grid - confirm against the pandas DataFrame.hist docs.
data[ordered_columns].hist(ax=axes[i], bins=20, alpha=0.7, edgecolor='black', color='skyblue')
axes[i].set_title(f'Distribución de {var_type}')
axes[i].set_xlabel('Valor')
axes[i].set_ylabel('Frecuencia')
axes[i].legend(ordered_columns)
plt.tight_layout()
plt.show()
But I got this result (last 4 variables):
Another possible option is to display the available measures in the same graph with different colors to see the distributions of the minimum, mean and maximum value of each variable as a whole.
答案1
得分: 1
以下是您要翻译的内容:
对我来说,通过嵌套两个for循环更容易保持这种图表排列的“网格思维”,一个处理图表的行(即“variable”),另一个处理图表的列(我已经命名为“statistics”)。
然而,整个图表变得太大,无法得到良好的格式,所以我想这不是以单个图表呈现数据的最佳方式。
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Maps each variable (one row of the plot grid) to the dataset columns that
# hold its measures.
variable_columns = {
    "precip": ["meanPrecip", "minPrecip", "maxPrecip"],
    "pressureChange": ["meanPressurechange", "minPressurechange", "maxPressurechange"],
    "pressureMeanSeaLevel": ["meanPressuremeansealevel", "minPressuremeansealevel", "maxPressuremeansealevel"],
    "relativeHumidity": ["meanRelativehumidity", "minRelativehumidity", "maxRelativehumidity"],
    "snow": ["meanSnow", "minSnow", "maxSnow"],
    "temperature": ["meanTemperature", "minTemperature", "maxTemperature"],
    "temperatureDewPoint": ["meanTemperaturedewpoint", "minTemperaturedewpoint", "maxTemperaturedewpoint"],
    "temperatureFeelsLike": ["meanTemperaturefeelslike", "minTemperaturefeelslike", "maxTemperaturefeelslike"],
    "uvIndex": ["meanUvindex", "maxUvindex", "minUvindex"],
    "visibility": ["meanVisibility", "minVisibility", "maxVisibility"],
    "windDirection": ["meanWinddirection"],
    "windSpeed": ["meanWindspeed", "minWindspeed", "maxWindspeed"],
}

# Generate fake data: 1000 standard-normal samples for every measure column.
meteo_data = {}
for cols in variable_columns.values():
    for col in cols:
        meteo_data[col] = np.random.randn(1000)
METEO_COMPLETO = pd.DataFrame(meteo_data)

# Statistic prefixes: they define the columns of the plot grid and are used to
# build the DataFrame column names.
statistics = ["max", "mean", "min"]

n_rows = len(variable_columns)
n_cols = len(statistics)
fig, axes = plt.subplots(
    nrows=n_rows,
    ncols=n_cols,
    # Scale the canvas with the grid; a fixed 5x5 inch figure cannot hold a
    # 12x3 grid of labelled histograms legibly.
    figsize=(4 * n_cols, 2.5 * n_rows),
    # Always get a 2-D array of Axes, even for a single row or column, so the
    # row/column iteration below keeps working.
    squeeze=False,
)

# Loop over the plot rows zipped with the variable names.
for i_row, (axes_row, variable) in enumerate(zip(axes, variable_columns)):
    # Loop over the plots of this row zipped with the statistic names.
    for ax, stat in zip(axes_row, statistics):
        # The variable names contain only letters, so str.title() upper-cases
        # the first letter and lower-cases the rest, e.g. "maxWindspeed".
        meteo_column = stat + variable.title()
        try:
            data = METEO_COMPLETO[meteo_column]
        except KeyError:
            # Statistic not available (e.g. maxWinddirection)
            # => hide the axis and go to the next plot.
            ax.axis("off")
            continue
        # Label the histogram with the column it shows; the legend previously
        # received a stale loop variable left over from the fake-data loop.
        ax.hist(data, label=meteo_column)
        ax.set_title(f"分布 {variable}")
        ax.set_xlabel("值")
        ax.set_ylabel("频率")
        ax.legend()
        if i_row == 0:
            # The top row's title is replaced by the statistic name so that it
            # acts as a column header.
            ax.set_title(stat)
plt.tight_layout()
plt.show()
英文:
For me it is easier to keep the "grid-thought" of such a plot arrangement by nesting two for loops, one dealing with the rows of the plot (i.e. "variable"), and one dealing with the columns of the plot (I have named that "statistics").
However the whole plot gets too large to get nicely formatted, so I guess this is not the best way to present the data in a single plot.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Maps each variable (one row of the plot grid) to the dataset columns that
# hold its measures.
variable_columns = {
    "precip": ["meanPrecip", "minPrecip", "maxPrecip"],
    "pressureChange": ["meanPressurechange", "minPressurechange", "maxPressurechange"],
    "pressureMeanSeaLevel": ["meanPressuremeansealevel", "minPressuremeansealevel", "maxPressuremeansealevel"],
    "relativeHumidity": ["meanRelativehumidity", "minRelativehumidity", "maxRelativehumidity"],
    "snow": ["meanSnow", "minSnow", "maxSnow"],
    "temperature": ["meanTemperature", "minTemperature", "maxTemperature"],
    "temperatureDewPoint": ["meanTemperaturedewpoint", "minTemperaturedewpoint", "maxTemperaturedewpoint"],
    "temperatureFeelsLike": ["meanTemperaturefeelslike", "minTemperaturefeelslike", "maxTemperaturefeelslike"],
    "uvIndex": ["meanUvindex", "maxUvindex", "minUvindex"],
    "visibility": ["meanVisibility", "minVisibility", "maxVisibility"],
    "windDirection": ["meanWinddirection"],
    "windSpeed": ["meanWindspeed", "minWindspeed", "maxWindspeed"],
}

# Generate fake data: 1000 standard-normal samples for every measure column.
meteo_data = {}
for cols in variable_columns.values():
    for col in cols:
        meteo_data[col] = np.random.randn(1000)
METEO_COMPLETO = pd.DataFrame(meteo_data)

# Statistic prefixes: they define the columns of the plot grid and are used to
# build the DataFrame column names.
statistics = ["max", "mean", "min"]

n_rows = len(variable_columns)
n_cols = len(statistics)
fig, axes = plt.subplots(
    nrows=n_rows,
    ncols=n_cols,
    # Scale the canvas with the grid; a fixed 5x5 inch figure cannot hold a
    # 12x3 grid of labelled histograms legibly.
    figsize=(4 * n_cols, 2.5 * n_rows),
    # Always get a 2-D array of Axes, even for a single row or column, so the
    # row/column iteration below keeps working.
    squeeze=False,
)

# Loop over the plot rows zipped with the variable names.
for i_row, (axes_row, variable) in enumerate(zip(axes, variable_columns)):
    # Loop over the plots of this row zipped with the statistic names.
    for ax, stat in zip(axes_row, statistics):
        # The variable names contain only letters, so str.title() upper-cases
        # the first letter and lower-cases the rest, e.g. "maxWindspeed".
        meteo_column = stat + variable.title()
        try:
            data = METEO_COMPLETO[meteo_column]
        except KeyError:
            # Statistic not available (e.g. maxWinddirection)
            # => hide the axis and go to the next plot.
            ax.axis("off")
            continue
        # Label the histogram with the column it shows; the legend previously
        # received a stale loop variable left over from the fake-data loop.
        ax.hist(data, label=meteo_column)
        ax.set_title(f"Distribución de {variable}")
        ax.set_xlabel("Valor")
        ax.set_ylabel("Frecuencia")
        ax.legend()
        if i_row == 0:
            # The top row's title is replaced by the statistic name so that it
            # acts as a column header.
            ax.set_title(stat)
plt.tight_layout()
plt.show()
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论