英文:
Getting rows that share the same - latest - timestamp
问题
-- 1. 找到过去15分钟内每分钟的最后一条遥测数据
; WITH DesiredDevices AS (
SELECT
DeviceId, Name
FROM MyDevice
WHERE DeviceId IN (1,2,3,4,5) -- 在实际情况中可能更复杂
)
, per_minute AS (
SELECT
row_number() OVER (PARTITION BY t.DeviceId, DATEDIFF(MINUTE, 0, TelemetryAtUtc ) ORDER BY TelemetryAtUtc DESC) AS minute_row_number,
(DATEDIFF(MINUTE, 0, TelemetryAtUtc)) AS TelemetryAtUtcMinute,
t.DeviceId,
TelemetryAtUtc,
V
FROM MyTelemetry t
JOIN DesiredDevices d ON d.DeviceId = t.DeviceId
WHERE TelemetryAtUtc > @FromUtc
)
, one_per_minute AS (SELECT * FROM per_minute T WHERE minute_row_number = 1)
, minute_count AS (
SELECT
TelemetryAtUtcMinute,
COUNT(*) AS NoOfTelems
FROM one_per_minute d
GROUP BY TelemetryAtUtcMinute
)
, X AS
(
SELECT
d.DeviceId,
d.Name,
t.TelemetryAtUtc,
t.V,
t.TelemetryAtUtcMinute,
mc.NoOfTelems
, ROW_NUMBER() OVER (PARTITION BY d.DeviceId ORDER BY NoOfTelems DESC, t.TelemetryAtUtcMinute DESC) AS rnx
FROM DesiredDevices d
LEFT JOIN one_per_minute t ON d.DeviceId = t.DeviceId
LEFT JOIN minute_count mc ON mc.TelemetryAtUtcMinute = t.TelemetryAtUtcMinute
)
SELECT *
FROM X
WHERE rnx = 1
ORDER BY TelemetryAtUtcMinute DESC
这是你提供的SQL查询的翻译。
英文:
Having:
CREATE TABLE [MyDevice](
[DeviceId] [int] NOT NULL,
[Name] [nvarchar](50) NOT NULL
)
DROP TABLE IF EXISTS [MyTelemetry]
CREATE TABLE [MyTelemetry](
[DeviceId] [int] NOT NULL,
[TelemetryAtUtc] [datetime2](3) NOT NULL,
[V] [numeric](8, 3) NULL,
)
INSERT INTO [MyDevice] ([DeviceId], [Name]) VALUES
(0, 'Device 0'),
(1, 'Device 1'),
(2, 'Device 2'),
(3, 'Device 3'),
(4, 'Device 4'),
(5, 'Device 5')
INSERT INTO [MyTelemetry] ([DeviceId], [TelemetryAtUtc], [V]) VALUES
(0, '2023-06-19 05:11:00', 0.11),
(0, '2023-06-19 05:12:00', 0.12),
(0, '2023-06-19 05:13:00', 0.13),
(1, '2023-06-19 05:11:00', 1.11),
(1, '2023-06-19 05:12:00', 1.12),
(1, '2023-06-19 05:12:30', 1.123), -- It's not common but there cane be multiple telemetry records per device per minute
(1, '2023-06-19 05:13:00', 1.13),
(2, '2023-06-19 05:11:00', 2.11),
(2, '2023-06-19 05:12:00', 2.12),
(2, '2023-06-19 05:13:00', 2.13),
(3, '2023-06-19 05:11:00', 3.11),
(3, '2023-06-19 05:12:00', 3.12), -- Device 3 doesn't have data for minute 05:13:00
-- Extra test data
(4, '2023-06-19 05:10:00', 4.10),
(5, '2023-06-19 05:15:00', 5.15)
-- The table has 1440 (number of minutes in 24h) * tens_of_thousands records
I need to retrieve data for devices meeting some criteria, but for a minute where there is telemetry for all of them.
Segmented by a minute the data looks like this:
TheMinute | d0 | d1 | d2 | d3 | d4 | d5 | Comment |
---|---|---|---|---|---|---|---|
2023-06-19 00:10:00 | * | Only d4 has data for this minute | |||||
2023-06-19 00:11:00 | * | * | * | * | d0-d3 have data | ||
2023-06-19 00:12:00 | * | ** | * | * | d0-d3 have data; d1 has 2 data points | ||
2023-06-19 00:13:00 | * | * | * | ||||
2023-06-19 00:14:00 | * |
Case 1. D1,D2,D3
When asking for devices (1,2,3) the desired rows are from minute 2023-06-19 5:12
because the more recent minute that has telemetry for d1, d2 and d3 is 5:12, d1 has two rows for this minute and the more recent one is returned:
Id Name TelemetryAtUtc V
1 Device 1 2023-06-19 05:12:30.000 1.123
2 Device 2 2023-06-19 05:12:00.000 2.12
3 Device 3 2023-06-19 05:12:00.000 3.12
Case 2. D1,D2,D3,D4,D5
There is no minute that has data for all 5 devices, which means I won't be using the telemetry data in my code.
I still need to return row for all devices (for other - non-telemetry - data about them) and I don't mind if TelemetryAtUtc
and V
have values or are nulls.
In the code which consumes the result of the SQL I check TelemetryAtUtc
and I don't use the V
if TelemetryAtUtc
is not the same for all records.
For example, this would be fine result:
Id Name TelemetryAtUtc V
5 Device 5 2023-06-19 05:15:00.000 5.15
1 Device 1 2023-06-19 05:12:30.000 1.123
2 Device 2 2023-06-19 05:12:00.000 2.12
3 Device 3 2023-06-19 05:12:00.000 3.12
4 Device 4 2023-06-19 05:10:00.000 4.1
Similarly, in case of ties - e.g. when 3 devices have telemetry at 5:11 and 3 devices have telemetry at 5:12 - I also don't mind the result as long as I can tell that there is no minute with telemetries for all devices.
The Question
Is there a performant way to achieve this?
I came up with the following (fiddle is http://sqlfiddle.com/#!18/51c85/1):
declare @FromUtc DATETIME2 = '2023-06-19 05:09:00' -- in real life it's DATEADD(MINUTE, -15, GETUTCDATE());
-- 1. Find the last telemetries for each minute in the last 15 minutes
; WITH DesiredDevices AS (
SELECT
DeviceId, Name
FROM MyDevice
WHERE DeviceId IN (1,2,3,4,5) -- more complex in real life
)
, per_minute AS (
SELECT
row_number() OVER (PARTITION BY t.DeviceId, DATEDIFF(MINUTE, 0, TelemetryAtUtc ) ORDER BY TelemetryAtUtc DESC) AS minute_row_number,
(DATEDIFF(MINUTE, 0, TelemetryAtUtc)) AS TelemetryAtUtcMinute,
t.DeviceId,
TelemetryAtUtc,
V
FROM MyTelemetry t
join DesiredDevices d ON d.DeviceId = t.DeviceId
WHERE TelemetryAtUtc > @FromUtc
)
, one_per_minute AS (SELECT * FROM per_minute T WHERE minute_row_number = 1)
, minute_count AS (
SELECT
TelemetryAtUtcMinute,
Count(*) AS NoOfTelems
FROM one_per_minute d
GROUP BY TelemetryAtUtcMinute
)
, X AS
(
SELECT
d.DeviceId,
d.Name,
t.TelemetryAtUtc,
t.V,
t.TelemetryAtUtcMinute,
mc.NoOfTelems
, row_number() OVER (PARTITION BY d.DeviceId ORDER BY NoOfTelems desc, t.TelemetryAtUtcMinute DESC) AS rnx
FROM DesiredDevices d
LEFT JOIN one_per_minute t ON d.DeviceId = t.DeviceId
LEFT JOIN minute_count mc ON mc.TelemetryAtUtcMinute = t.TelemetryAtUtcMinute
)
SELECT *
FROM X
WHERE rnx = 1
ORDER by TelemetryAtUtcMinute DESC
which results in an acceptable result:
DeviceId Name TelemetryAtUtc V TelemetryAtUtcMinute NoOfTelems rnx
5 Device 5 2023-06-19 05:15:00.000 5.15 64935675 1 1
1 Device 1 2023-06-19 05:12:30.000 1.123 64935672 3 1
2 Device 2 2023-06-19 05:12:00.000 2.12 64935672 3 1
3 Device 3 2023-06-19 05:12:00.000 3.12 64935672 3 1
4 Device 4 2023-06-19 05:10:00.000 4.1 64935670 1 1
答案1
得分: 1
1. 在这里,您可以添加设备日期的选择。
2. 此部分确保每分钟只有一个设备,以确保后续每分钟设备计算不会受到影响。
3. 这创建了一个每分钟的计数器。
4. 按计数器和最后的遥测日期的顺序获取第一行。这确保我们优先处理具有最多计算的分钟。WITH TIES 是一个技巧,只获取每个设备的第一行。
英文:
You should be able to do what you did inline:
SELECT TOP 1 WITH TIES * -- 4.
FROM (
SELECT count(*) OVER(partition BY cast(telemetryAtutc AS date), datepart(hour, telemetryatutc), datepart(minute, telemetryatutc)) AS cnt -- 3. count per minute
, *
FROM (
SELECT deviceid, max(telemetryatutc) AS telemetryatutc -- 2. collapse multiple times per device
FROM MyTelemetry
WHERE DeviceId IN (1,2,3,4,5) -- 1. add selections here
GROUP BY deviceid, cast(telemetryAtutc AS date), datepart(hour, telemetryatutc), datepart(minute, telemetryatutc)
) x
) x
ORDER BY ROW_NUMBER() OVER(partition BY deviceid ORDER BY cnt DESC, telemetryatutc DESC) -- 4.
- Here you add your selections on devices dates
- This part makes sure there's only one device per minute, so they don't skewer the calculation of devices per minute later
- This creates a per minute counter
- The order by gets first row based on the counter first and the last telemetry date second. This ensures we prioritize the minutes with most calculations. WITH TIES is a trick which fetches first row only per deviceId
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论