英文:
How can I show the distribution of the columns of a dataset, ordering the images in a specific way?
问题
我正在尝试显示数据集的列分布,其中我有以下变量:['precip', 'pressureChange', 'pressureMeanSeaLevel', 'relativeHumidity', 'snow', 'temperature', 'temperatureDewPoint', 'temperatureFeelsLike', 'uvIndex', 'visibility', 'windDirection', 'windSpeed']
每个变量都有若干个统计量(例如平均值、最小值和最大值),每个统计量都是数据集的一列:
{
'precip': ['meanPrecip', 'minPrecip', 'maxPrecip'],
'pressureChange': ['meanPressurechange', 'minPressurechange', 'maxPressurechange'],
'pressureMeanSeaLevel': ['meanPressuremeansealevel', 'minPressuremeansealevel', 'maxPressuremeansealevel'],
'relativeHumidity': ['meanRelativehumidity', 'minRelativehumidity', 'maxRelativehumidity'],
'snow': ['meanSnow', 'minSnow', 'maxSnow'],
'temperature': ['meanTemperature', 'minTemperature', 'maxTemperature'],
'temperatureDewPoint': ['meanTemperaturedewpoint', 'minTemperaturedewpoint', 'maxTemperaturedewpoint'],
'temperatureFeelsLike': ['meanTemperaturefeelslike', 'minTemperaturefeelslike', 'maxTemperaturefeelslike'],
'uvIndex': ['modeUvindex', 'maxUvindex', 'minUvindex'],
'visibility': ['meanVisibility', 'minVisibility', 'maxVisibility'],
'windDirection': ['meanWinddirection'],
'windSpeed': ['meanWindspeed', 'minWindspeed', 'maxWindspeed']
}
我想绘制它们,其中每一行的图形是一个变量,每一列的图形是一个测量值。
我尝试了以下代码:
# Variables to plot; the loop below handles one per iteration.
variable_types = ['precip', 'pressureChange', 'pressureMeanSeaLevel', 'relativeHumidity', 'snow', 'temperature', 'temperatureDewPoint', 'temperatureFeelsLike', 'uvIndex', 'visibility', 'windDirection', 'windSpeed']
# Maps each variable to the dataset columns that hold its measures.
variable_columns = {
'precip': ['meanPrecip', 'minPrecip', 'maxPrecip'],
'pressureChange': ['meanPressurechange', 'minPressurechange', 'maxPressurechange'],
'pressureMeanSeaLevel': ['meanPressuremeansealevel', 'minPressuremeansealevel', 'maxPressuremeansealevel'],
'relativeHumidity': ['meanRelativehumidity', 'minRelativehumidity', 'maxRelativehumidity'],
'snow': ['meanSnow', 'minSnow', 'maxSnow'],
'temperature': ['meanTemperature', 'minTemperature', 'maxTemperature'],
'temperatureDewPoint': ['meanTemperaturedewpoint', 'minTemperaturedewpoint', 'maxTemperaturedewpoint'],
'temperatureFeelsLike': ['meanTemperaturefeelslike', 'minTemperaturefeelslike', 'maxTemperaturefeelslike'],
'uvIndex': ['modeUvindex', 'maxUvindex', 'minUvindex'],
'visibility': ['meanVisibility', 'minVisibility', 'maxVisibility'],
'windDirection': ['meanWinddirection'],
'windSpeed': ['meanWindspeed', 'minWindspeed', 'maxWindspeed']
}
# NOTE(review): indentation was lost in this copy of the snippet; presumably the
# statements from `columns = ...` through `axes[i].legend(...)` form the loop body - confirm.
for i, var_type in enumerate(variable_types):
columns = variable_columns[var_type]
print(columns)
# Requests a grid with one row per variable and one column per measure of the current variable.
fig, axes = plt.subplots(nrows=len(variable_types), ncols=len(columns), figsize=(5, 5))
# Flat view of the grid so that a single index `i` can select an Axes below.
axes = axes.flat
data = METEO_COMPLETO[columns]
# None of the column names listed above contains '_', so the sort key is the
# whole name and this is a plain alphabetical sort.
ordered_columns = sorted(columns, key=lambda x: x.split('_')[0])
print(ordered_columns)
# NOTE(review): DataFrame.hist draws one subplot per column; presumably handing it a
# single Axes for several columns makes pandas clear and rebuild the figure - confirm.
data[ordered_columns].hist(ax=axes[i], bins=20, alpha=0.7, edgecolor='black', color='skyblue')
axes[i].set_title(f'Distribución de {var_type}')
axes[i].set_xlabel('Valor')
axes[i].set_ylabel('Frecuencia')
axes[i].legend(ordered_columns)
plt.tight_layout()
plt.show()
另一个可能的选项是在同一图中以不同的颜色显示可用的测量值,以整体上看每个变量的最小值、平均值和最大值的分布。
英文:
I am trying to show the distribution of the columns of a dataset that has these variables: ['precip', 'pressureChange', 'pressureMeanSeaLevel', 'relativeHumidity', 'snow', 'temperature', 'temperatureDewPoint', 'temperatureFeelsLike', 'uvIndex', 'visibility', 'windDirection', 'windSpeed']
Each variable has several measures, and each of these measures is a column of the dataset:
{
'precip': ['meanPrecip', 'minPrecip', 'maxPrecip'],
'pressureChange': ['meanPressurechange', 'minPressurechange', 'maxPressurechange'],
'pressureMeanSeaLevel': ['meanPressuremeansealevel', 'minPressuremeansealevel', 'maxPressuremeansealevel'],
'relativeHumidity': ['meanRelativehumidity', 'minRelativehumidity', 'maxRelativehumidity'],
'snow': ['meanSnow', 'minSnow', 'maxSnow'],
'temperature': ['meanTemperature', 'minTemperature', 'maxTemperature'],
'temperatureDewPoint': ['meanTemperaturedewpoint', 'minTemperaturedewpoint', 'maxTemperaturedewpoint'],
'temperatureFeelsLike': ['meanTemperaturefeelslike', 'minTemperaturefeelslike', 'maxTemperaturefeelslike'],
'uvIndex': ['modeUvindex', 'maxUvindex', 'minUvindex'],
'visibility': ['meanVisibility', 'minVisibility', 'maxVisibility'],
'windDirection': ['meanWinddirection'],
'windSpeed': ['meanWindspeed', 'minWindspeed', 'maxWindspeed']
}
I want to plot them where each row of graphs is a variable and each column of graphs is a measurement.
I tried that:
# Variables to plot; the loop below handles one per iteration.
variable_types = ['precip', 'pressureChange', 'pressureMeanSeaLevel', 'relativeHumidity', 'snow', 'temperature', 'temperatureDewPoint', 'temperatureFeelsLike', 'uvIndex', 'visibility', 'windDirection', 'windSpeed']
# Maps each variable to the dataset columns that hold its measures.
variable_columns = {
'precip': ['meanPrecip', 'minPrecip', 'maxPrecip'],
'pressureChange': ['meanPressurechange', 'minPressurechange', 'maxPressurechange'],
'pressureMeanSeaLevel': ['meanPressuremeansealevel', 'minPressuremeansealevel', 'maxPressuremeansealevel'],
'relativeHumidity': ['meanRelativehumidity', 'minRelativehumidity', 'maxRelativehumidity'],
'snow': ['meanSnow', 'minSnow', 'maxSnow'],
'temperature': ['meanTemperature', 'minTemperature', 'maxTemperature'],
'temperatureDewPoint': ['meanTemperaturedewpoint', 'minTemperaturedewpoint', 'maxTemperaturedewpoint'],
'temperatureFeelsLike': ['meanTemperaturefeelslike', 'minTemperaturefeelslike', 'maxTemperaturefeelslike'],
'uvIndex': ['modeUvindex', 'maxUvindex', 'minUvindex'],
'visibility': ['meanVisibility', 'minVisibility', 'maxVisibility'],
'windDirection': ['meanWinddirection'],
'windSpeed': ['meanWindspeed', 'minWindspeed', 'maxWindspeed']
}
# NOTE(review): indentation was lost in this copy of the snippet; presumably the
# statements from `columns = ...` through `axes[i].legend(...)` form the loop body - confirm.
for i, var_type in enumerate(variable_types):
columns = variable_columns[var_type]
print(columns)
# Requests a grid with one row per variable and one column per measure of the current variable.
fig, axes = plt.subplots(nrows=len(variable_types), ncols=len(columns), figsize=(5, 5))
# Flat view of the grid so that a single index `i` can select an Axes below.
axes = axes.flat
data = METEO_COMPLETO[columns]
# None of the column names listed above contains '_', so the sort key is the
# whole name and this is a plain alphabetical sort.
ordered_columns = sorted(columns, key=lambda x: x.split('_')[0])
print(ordered_columns)
# NOTE(review): DataFrame.hist draws one subplot per column; presumably handing it a
# single Axes for several columns makes pandas clear and rebuild the figure - confirm.
data[ordered_columns].hist(ax=axes[i], bins=20, alpha=0.7, edgecolor='black', color='skyblue')
axes[i].set_title(f'Distribución de {var_type}')
axes[i].set_xlabel('Valor')
axes[i].set_ylabel('Frecuencia')
axes[i].legend(ordered_columns)
plt.tight_layout()
plt.show()
But I got this result (shown for the last 4 variables):
Another possible option is to display the available measures in the same graph with different colors to see the distributions of the minimum, mean and maximum value of each variable as a whole.
答案1
得分: 1
对于我来说,通过嵌套两个for循环更容易保持这种图表布局的“网格思维”,一个处理图表的行(即“variable”),一个处理图表的列(我将其命名为“statistics”)。
然而,整个图表变得太大,无法得到良好的格式,所以我想这不是以单个图表的方式呈现数据的最佳方法。
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Maps each variable (one plot row) to the dataset columns holding its statistics.
# NOTE(review): the question's dataset lists 'modeUvindex' rather than 'meanUvindex';
# with the real data the "mean" cell of the uvIndex row would be blanked by the
# KeyError branch below - confirm which column actually exists.
variable_columns = {
    "precip": ["meanPrecip", "minPrecip", "maxPrecip"],
    "pressureChange": ["meanPressurechange", "minPressurechange", "maxPressurechange"],
    "pressureMeanSeaLevel": ["meanPressuremeansealevel", "minPressuremeansealevel", "maxPressuremeansealevel"],
    "relativeHumidity": ["meanRelativehumidity", "minRelativehumidity", "maxRelativehumidity"],
    "snow": ["meanSnow", "minSnow", "maxSnow"],
    "temperature": ["meanTemperature", "minTemperature", "maxTemperature"],
    "temperatureDewPoint": ["meanTemperaturedewpoint", "minTemperaturedewpoint", "maxTemperaturedewpoint"],
    "temperatureFeelsLike": ["meanTemperaturefeelslike", "minTemperaturefeelslike", "maxTemperaturefeelslike"],
    "uvIndex": ["meanUvindex", "maxUvindex", "minUvindex"],
    "visibility": ["meanVisibility", "minVisibility", "maxVisibility"],
    "windDirection": ["meanWinddirection"],
    "windSpeed": ["meanWindspeed", "minWindspeed", "maxWindspeed"],
}

# Generate fake data: 1000 standard-normal samples for every listed column.
meteo_data = {
    column: np.random.randn(1000)
    for columns in variable_columns.values()
    for column in columns
}
METEO_COMPLETO = pd.DataFrame(meteo_data)

# Statistic prefixes; they define both the plot columns (left to right) and
# the prefix used to build each dataset column name.
statistics = ["max", "mean", "min"]

n_rows, n_cols = len(variable_columns), len(statistics)
fig, axes = plt.subplots(
    nrows=n_rows,
    ncols=n_cols,
    # Scale the canvas with the grid: a fixed 5x5 inch figure is far too small
    # for a 12x3 grid of histograms.
    figsize=(4 * n_cols, 2.5 * n_rows),
    # Always return a 2-D array so the row iteration below also works for a
    # single variable or a single statistic.
    squeeze=False,
)

# Outer loop: one row of axes per variable.
for i_row, (axes_row, variable) in enumerate(zip(axes, variable_columns)):
    # Inner loop: one axes per statistic within that row.
    for ax, stat in zip(axes_row, statistics):
        # str.title() upper-cases the first letter and lower-cases the rest
        # ("pressureChange" -> "Pressurechange"), which matches the column names.
        meteo_column = stat + variable.title()
        try:
            data = METEO_COMPLETO[meteo_column]
        except KeyError:
            # Statistic not available (e.g. maxWinddirection):
            # hide this axes and move on to the next cell.
            ax.axis("off")
            continue
        # Label the histogram with the column it shows so the legend is correct.
        ax.hist(data, label=meteo_column)
        # The first row uses the statistic name as a column header instead.
        ax.set_title(stat if i_row == 0 else f"分布 {variable}")
        ax.set_xlabel("值")
        ax.set_ylabel("频率")
        ax.legend()

plt.tight_layout()
plt.show()
英文:
For me, it is easier to keep the grid layout of such a plot arrangement in mind by nesting two for loops: one dealing with the rows of the plot (i.e. the variables), and one dealing with the columns of the plot (which I have named "statistics").
However, the whole plot gets too large to be formatted nicely, so I guess this is not the best way to present the data in a single plot.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Maps each variable (one plot row) to the dataset columns holding its statistics.
# NOTE(review): the question's dataset lists 'modeUvindex' rather than 'meanUvindex';
# with the real data the "mean" cell of the uvIndex row would be blanked by the
# KeyError branch below - confirm which column actually exists.
variable_columns = {
    "precip": ["meanPrecip", "minPrecip", "maxPrecip"],
    "pressureChange": ["meanPressurechange", "minPressurechange", "maxPressurechange"],
    "pressureMeanSeaLevel": ["meanPressuremeansealevel", "minPressuremeansealevel", "maxPressuremeansealevel"],
    "relativeHumidity": ["meanRelativehumidity", "minRelativehumidity", "maxRelativehumidity"],
    "snow": ["meanSnow", "minSnow", "maxSnow"],
    "temperature": ["meanTemperature", "minTemperature", "maxTemperature"],
    "temperatureDewPoint": ["meanTemperaturedewpoint", "minTemperaturedewpoint", "maxTemperaturedewpoint"],
    "temperatureFeelsLike": ["meanTemperaturefeelslike", "minTemperaturefeelslike", "maxTemperaturefeelslike"],
    "uvIndex": ["meanUvindex", "maxUvindex", "minUvindex"],
    "visibility": ["meanVisibility", "minVisibility", "maxVisibility"],
    "windDirection": ["meanWinddirection"],
    "windSpeed": ["meanWindspeed", "minWindspeed", "maxWindspeed"],
}

# Generate fake data: 1000 standard-normal samples for every listed column.
meteo_data = {
    column: np.random.randn(1000)
    for columns in variable_columns.values()
    for column in columns
}
METEO_COMPLETO = pd.DataFrame(meteo_data)

# Statistic prefixes; they define both the plot columns (left to right) and
# the prefix used to build each dataset column name.
statistics = ["max", "mean", "min"]

n_rows, n_cols = len(variable_columns), len(statistics)
fig, axes = plt.subplots(
    nrows=n_rows,
    ncols=n_cols,
    # Scale the canvas with the grid: a fixed 5x5 inch figure is far too small
    # for a 12x3 grid of histograms.
    figsize=(4 * n_cols, 2.5 * n_rows),
    # Always return a 2-D array so the row iteration below also works for a
    # single variable or a single statistic.
    squeeze=False,
)

# Outer loop: one row of axes per variable.
for i_row, (axes_row, variable) in enumerate(zip(axes, variable_columns)):
    # Inner loop: one axes per statistic within that row.
    for ax, stat in zip(axes_row, statistics):
        # str.title() upper-cases the first letter and lower-cases the rest
        # ("pressureChange" -> "Pressurechange"), which matches the column names.
        meteo_column = stat + variable.title()
        try:
            data = METEO_COMPLETO[meteo_column]
        except KeyError:
            # Statistic not available (e.g. maxWinddirection):
            # hide this axes and move on to the next cell.
            ax.axis("off")
            continue
        # Label the histogram with the column it shows so the legend is correct.
        ax.hist(data, label=meteo_column)
        # The first row uses the statistic name as a column header instead.
        ax.set_title(stat if i_row == 0 else f"Distribución de {variable}")
        ax.set_xlabel("Valor")
        ax.set_ylabel("Frecuencia")
        ax.legend()

plt.tight_layout()
plt.show()
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论