SQL 填充每组中的缺失值为平均值

huangapple go评论68阅读模式
英文:

SQL to fill missing value by AVG in each Group

问题

Step 2 填充缺失的日期和价格是通过以下方式完成的,只计算相邻的一个较小值和一个较大值的平均值,而不是计算所有行的平均值:

SELECT d.CreateDate,
COALESCE(Price, 
    AVG(CASE WHEN m.Price IS NOT NULL THEN m.Price END) OVER (ORDER BY d.CreateDate ASC ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING)
) as New_Price,
m.*
From #DateRange d
LEFT OUTER JOIN #mytable m on d.CreateDate = m.CreateDate

这将为每个产品组计算相邻的一个较小值和一个较大值的平均价格,而无需显式设置产品为'BB'。

英文:

I want to fill missing date and Price with Average for below table

SQL 填充每组中的缺失值为平均值

Sample data

if OBJECT_ID('tempdb..#mytable') is not null DROP TABLE #mytable
CREATE TABLE #mytable (CreateDate datetime, Product Varchar(4), Price money)
INSERT INTO #mytable VALUES
('2023-06-09 17:01:00.000','Tree',1),
('2023-06-09 17:02:00.000','Tree',2),
('2023-06-09 17:03:00.000','Tree',3),
('2023-06-09 17:04:00.000','Tree',4),
('2023-06-09 17:01:00.000','BB',20),
('2023-06-09 17:02:00.000','BB',40),
('2023-06-09 17:04:00.000','BB',60),
('2023-06-09 17:01:00.000','Car',20),
('2023-06-09 17:03:00.000','Car',30),
('2023-06-09 17:04:00.000','Car',50)

Step 1 is to create a complete Datetime set

if OBJECT_ID('tempdb..#DateRange') is not null DROP TABLE #DateRange
Create Table #DateRange(CreateDate datetime Primary key Clustered)
GO

Declare @startdate datetime = '2023-06-09 17:01:00', @endtime datetime = '2023-06-09 17:04:00'
While (@startdate <= @endtime)
BEGIN
Insert into #DateRange values (@startdate)
set @startdate = DATEADD(MINUTE, 1, @startdate)
END

Step 2 to fill NULL with average of Only ONE upper and Only ONE lower number , not AVG of all rows

SELECT d.CreateDate,
COALESCE(Price, AVG(Price) OVER ()) as New_Price,
m.*
From #DateRange d
LEFT OUTER JOIN #mytable m on d.CreateDate = m.CreateDate
and Product = 'BB'

My question is how loop through each Product Group so I don't need to set product = BB explicitly?

My expected result is

SQL 填充每组中的缺失值为平均值

答案1

得分: 2

尝试以下(如果您正在使用 SQL Server 2022):

declare @start_time datetime = '2023-06-09 17:01:00';
declare @end_time datetime = '2023-06-09 17:04:00';

with Calendar as 
(-- 步骤1:根据起始时间和结束时间构建日历表。
  select top(datediff(minute, @start_time, @end_time) +1)
         dateadd(minute, row_number() over (order by number) -1, @start_time) dt
  from master..spt_values
)
select all_dts.dt as CreateDate,
       p.product,
       -- 使用 first_value 和 last_value 函数以及 (ignore nulls) 选项获取下一个和前一个非空价格值。
       isnull(tbl.price, 
              (isnull(first_value(tbl.price) ignore nulls over (partition by p.product order by all_dts.dt range between current row and unbounded following), 0) +
              isnull(last_value(tbl.price) ignore nulls over (partition by p.product order by all_dts.dt range between unbounded preceding and current row), 0)) /2.0
            ) as price 
from Calendar all_dts
-- 步骤2:在步骤1生成的日期和产品的不同值之间执行交叉连接,以获取所有可能的组合(日期和产品)
cross join (select distinct product from #mytable) p
-- 步骤3:与表进行左连接,以获取缺失的日期
left join #mytable tbl
on all_dts.dt = tbl.CreateDate and
   p.product = tbl.product
order by p.product, all_dts.dt

对于 SQL Server 的早期版本,我们可以使用子查询代替 first_valuelast_value 函数以及 ignore null

declare @start_time datetime = '2023-06-09 17:01:00';
declare @end_time datetime = '2023-06-09 17:04:00';

with Calendar as 
(-- 步骤1:根据起始时间和结束时间构建日历表。
  select top(datediff(minute, @start_time, @end_time) +1)
         dateadd(minute, row_number() over (order by number) -1, @start_time) dt
  from master..spt_values
)
select all_dts.dt as CreateDate,
       p.product,
       -- 使用子查询获取下一个和前一个非空价格值。
       isnull(tbl.price, 
              (isnull((select top 1 price from #mytable mtbl where mtbl.product = p.product and mtbl.CreateDate > all_dts.dt order by CreateDate), 0) +
               isnull((select top 1 price from #mytable mtbl where mtbl.product = p.product and mtbl.CreateDate < all_dts.dt order by CreateDate desc), 0)) /2.0
            ) as price 
from Calendar all_dts
-- 步骤2:在步骤1生成的日期和产品的不同值之间执行交叉连接,以获取所有可能的组合(日期和产品)
cross join (select distinct product from #mytable) p
-- 步骤3:与表进行左连接,以获取缺失的日期
left join #mytable tbl
on all_dts.dt = tbl.CreateDate and
   p.product = tbl.product
order by p.product, all_dts.dt
英文:

Try the following (if you are using SQL Server 2022):

declare @start_time datetime = &#39;2023-06-09 17:01:00&#39;;
declare @end_time datetime = &#39;2023-06-09 17:04:00&#39;;

with Calendar as 
(-- step1: build a calender table based on your start and end time.
  select top(datediff(minute, @start_time, @end_time) +1)
         dateadd(minute, row_number() over (order by number) -1, @start_time) dt
  from master..spt_values
)
select all_dts.dt as CreateDate,
       p.product,
       -- use first_value and last_value functions with the (ignore nulls) option to get the next and previous not null price value.
       isnull(tbl.price, 
              (isnull(first_value(tbl.price) ignore nulls over (partition by p.product order by all_dts.dt range between current row and unbounded following), 0) +
              isnull(last_value(tbl.price) ignore nulls over (partition by p.product order by all_dts.dt range between unbounded preceding and current row), 0)) /2.0
            ) as price 
from Calendar all_dts
-- step2: do a cross join between the generated dates in step1 and the distinct values of products to get all possisble combinations of (dates and products)
cross join (select distinct product from #mytable) p
-- step3: do a left join with the table to get the missing dates
left join #mytable tbl
on all_dts.dt = tbl.CreateDate and
   p.product = tbl.product
order by p.product, all_dts.dt

demo

For previous versions of the SQL server, we could implement a subquery instead of the first_value and last_value functions with the ignore null:

declare @start_time datetime = &#39;2023-06-09 17:01:00&#39;;
declare @end_time datetime = &#39;2023-06-09 17:04:00&#39;;

with Calendar as 
(-- step1: build a calender table based on your start and end time.
  select top(datediff(minute, @start_time, @end_time) +1)
         dateadd(minute, row_number() over (order by number) -1, @start_time) dt
  from master..spt_values
)
select all_dts.dt as CreateDate,
       p.product,
       -- Use subquery to get the next and previous not null price value.
       isnull(tbl.price, 
              (isnull((select top 1 price from #mytable mtbl where mtbl.product = p.product and mtbl.CreateDate &gt; all_dts.dt order by CreateDate), 0) +
               isnull((select top 1 price from #mytable mtbl where mtbl.product = p.product and mtbl.CreateDate &lt; all_dts.dt order by CreateDate desc), 0)) /2.0
            ) as price 
from Calendar all_dts
-- step2: do a cross join between the generated dates in step1 and the distinct values of products to get all possisble combinations of (dates and products)
cross join (select distinct product from #mytable) p
-- step3: do a left join with the table to get the missing dates
left join #mytable tbl
on all_dts.dt = tbl.CreateDate and
   p.product = tbl.product
order by p.product, all_dts.dt

demo

答案2

得分: 1

以下是您要翻译的内容:

你可以使用一个公用表表达式(CTE)。
首先创建日期列表和创建独特产品列表。

然后,将日期独特产品列表与名为mytable的表连接,以查找日期之间的间隙。

声明 @startdate datetime = (选择 #mytable 中的最小(CreateDate))
, @endtime datetime = (选择 #mytable 中的最大(CreateDate))

;使用 Listunique 作为 ( 
选择   distinct Product
 #mytable 
),

_List (date_,Product) 作为 ( 
		选择 @startdate 作为 date_,Product
		 (选择 distinct Product   Listunique)a
		union all
		选择  DATEADD(minute,1, date_) 作为 date_,Product
		 _List
		其中 date_ < @endtime
)
选择 a.*,f.price
 _List a

full join #mytable b on a.date_=b.CreateDate  a.Product=b.Product
outer apply (
				选择 sum(ISNULL(p,0) +ISNULL(n,0)) /2 作为 price
				 (
						选择  Price 作为 p,0 作为 n  #mytable l 
						其中 l.Product=a.Product  l.CreateDate=DATEADD(minute,-1, a.date_)
						union 
						选择 0 作为 p,Price 作为 n  #mytable l 
						其中 l.Product=a.Product  l.CreateDate=DATEADD(minute,1, a.date_)
		        )d
		    )f
其中 b.CreateDate is null

演示

英文:

You can use a CTE.
First create list date and create list unique Product

Then join List Date_uniqueProduct with table mytable for find gap with date

Declare @startdate datetime = (select  min(CreateDate) from #mytable)
, @endtime datetime =(select  max(CreateDate) from #mytable)


;with Listunique as (
select   distinct Product
from #mytable
),

_List (date_,Product) as ( 
		select @startdate as date_,Product
		from (select distinct Product from  Listunique)a
		union all
		select  DATEADD(minute,1, date_) as date_,Product
		from _List
		where date_ &lt;@endtime

)
select a.*,f.price
from _List a

full join #mytable b on a.date_=b.CreateDate and a.Product=b.Product
outer apply (
				select sum(ISNULL(p,0) +ISNULL(n,0)) /2 as price
				from (
						select  Price as p,0 as n from #mytable l 
						where l.Product=a.Product and l.CreateDate=DATEADD(minute,-1, a.date_)
						union 
						select 0 as p,Price as n from #mytable l 
						where l.Product=a.Product and l.CreateDate=DATEADD(minute,1, a.date_)
		        )d
		    )f
where b.CreateDate is null

Demo

答案3

得分: 1

使用GENERATE_SERIES创建一个日期范围,将这个日期列表与表连接以识别缺失的值,然后使用每个组的第一个和最后一个记录的平均值(使用FIRST_VALUELAST_VALUE)填充缺失的值:

Declare @startdate datetime = (select min(CreateDate) from #mytable)
,@endtime datetime =(select max(CreateDate) from #mytable);

with daterange as (
  SELECT DATEADD(minute,value,@startdate) as CreateDate, Product
  FROM GENERATE_SERIES(datepart(minute, @startdate) - 1, 
                     datepart(minute, @endtime) - 1,1)
  CROSS JOIN (SELECT DISTINCT Product FROM #mytable) p
),
cte as (
  SELECT d.*, t.Price
  FROM daterange d
  LEFT JOIN #mytable t on d.CreateDate = t.CreateDate AND d.Product = t.Product
),
avg_cte as (
  select *,(COALESCE(first_value(price) ignore nulls over (partition by product order by CreateDate range between current row and unbounded following), 0) +
           COALESCE(last_value(price) ignore nulls over (partition by product order by CreateDate range between unbounded preceding and current row), 0)) /2.0 as avg_price
  from cte
)
select CreateDate, Product, avg_price
from avg_cte
where price is null

结果:

CreateDate	            Product	avg_price
2023-06-09 17:03:00.000	BB	    50.0000
2023-06-09 17:02:00.000	Car	    25.0000

演示链接

英文:

Use GENERATE_SERIES to create a date range, join this list of dates to the table to identify the missing values, and then fill in the missing values using the average of only the first and last record of each group using (FIRST_VALUE and LAST_VALUE):

Declare @startdate datetime = (select min(CreateDate) from #mytable)
,@endtime datetime =(select max(CreateDate) from #mytable);

with daterange as (
  SELECT DATEADD(minute,value,@startdate) as CreateDate, Product
  FROM GENERATE_SERIES(datepart(minute, @startdate) - 1, 
                     datepart(minute, @endtime) - 1,1)
  CROSS JOIN (SELECT DISTINCT Product FROM #mytable) p
),
cte as (
  SELECT d.*, t.Price
  FROM daterange d
  LEFT JOIN #mytable t on d.CreateDate = t.CreateDate AND d.Product = t.Product
),
avg_cte as (
  select *,(COALESCE(first_value(price) ignore nulls over (partition by product order by CreateDate range between current row and unbounded following), 0) +
           COALESCE(last_value(price) ignore nulls over (partition by product order by CreateDate range between unbounded preceding and current row), 0)) /2.0 as avg_price
  from cte
)
select CreateDate, Product, avg_price
from avg_cte
where price is null

Result :

CreateDate	            Product	Price
2023-06-09 17:03:00.000	BB	    50.0000
2023-06-09 17:02:00.000	Car	    25.0000

Demo here

huangapple
  • 本文由 发表于 2023年6月12日 06:24:18
  • 转载请务必保留本文链接:https://go.coder-hub.com/76452746.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定