import duckdbconn = duckdb.connect()mock_data_sql ="""SELECT * FROM ( VALUES (1, DATE '2024-07-01'), (1, DATE '2024-07-02'), (1, DATE '2024-07-03'), (1, DATE '2024-07-05'), (1, DATE '2024-07-06'), (2, DATE '2024-07-01'), (2, DATE '2024-07-02'), (2, DATE '2024-07-03'), (3, DATE '2024-07-01'), (3, DATE '2024-07-02'), (3, DATE '2024-07-03'), (3, DATE '2024-07-04'), (3, DATE '2024-07-05'), (3, DATE '2024-07-06'), (3, DATE '2024-07-07'), (3, DATE '2024-07-08'), (3, DATE '2024-07-09'), (4, DATE '2024-07-05'), (4, DATE '2024-07-06'), (4, DATE '2024-07-07'), (4, DATE '2024-07-08'), (4, DATE '2024-07-09'), ) AS t(source_id, end_date)"""df = conn.execute(mock_data_sql).df()df
group_date_sql =""" SELECT source_id, end_date, end_date - INTERVAL (ROW_NUMBER() OVER (PARTITION BY source_id ORDER BY end_date) - 1) DAY AS group_date FROM df order by source_id, end_date"""grouped_date_df = conn.execute(group_date_sql).df()grouped_date_df
source_id
end_date
group_date
0
1
2024-07-01
2024-07-01
1
1
2024-07-02
2024-07-01
2
1
2024-07-03
2024-07-01
3
1
2024-07-05
2024-07-02
4
1
2024-07-06
2024-07-02
5
2
2024-07-01
2024-07-01
6
2
2024-07-02
2024-07-01
7
2
2024-07-03
2024-07-01
8
3
2024-07-01
2024-07-01
9
3
2024-07-02
2024-07-01
10
3
2024-07-03
2024-07-01
11
3
2024-07-04
2024-07-01
12
3
2024-07-05
2024-07-01
13
3
2024-07-06
2024-07-01
14
3
2024-07-07
2024-07-01
15
3
2024-07-08
2024-07-01
16
3
2024-07-09
2024-07-01
17
4
2024-07-05
2024-07-05
18
4
2024-07-06
2024-07-05
19
4
2024-07-07
2024-07-05
20
4
2024-07-08
2024-07-05
21
4
2024-07-09
2024-07-05
3. 根据 source_id 和分组时间分组
group_sql ="""SELECT source_id, MIN(end_date) AS start_date, MAX(end_date) AS end_date, FROM grouped_date_df GROUP BY source_id, group_date ORDER BY source_id, group_date"""grouped_df = conn.execute(group_sql).df()grouped_df
source_id
start_date
end_date
0
1
2024-07-01
2024-07-03
1
1
2024-07-05
2024-07-06
2
2
2024-07-01
2024-07-03
3
3
2024-07-01
2024-07-09
4
4
2024-07-05
2024-07-09
4. 可视化
代码
import plotly.express as pxfig = px.timeline(grouped_df, x_start='start_date', x_end='end_date', y='source_id')fig.update_yaxes(autorange="reversed")fig.show()
5. 完整代码
sql ="""with raw_data as ( SELECT * FROM ( VALUES (1, DATE '2024-07-01'), (1, DATE '2024-07-02'), (1, DATE '2024-07-03'), (1, DATE '2024-07-05'), (1, DATE '2024-07-06'), (2, DATE '2024-07-01'), (2, DATE '2024-07-02'), (2, DATE '2024-07-03'), (3, DATE '2024-07-01'), (3, DATE '2024-07-02'), (3, DATE '2024-07-03'), (3, DATE '2024-07-04'), (3, DATE '2024-07-05'), (3, DATE '2024-07-06'), (3, DATE '2024-07-07'), (3, DATE '2024-07-08'), (3, DATE '2024-07-09'), (4, DATE '2024-07-05'), (4, DATE '2024-07-06'), (4, DATE '2024-07-07'), (4, DATE '2024-07-08'), (4, DATE '2024-07-09'), ) AS t(source_id, end_date)), group_date as ( SELECT source_id, end_date, end_date - INTERVAL (ROW_NUMBER() OVER (PARTITION BY source_id ORDER BY end_date) - 1) DAY AS group_date FROM raw_data order by source_id, end_date), final as ( SELECT source_id, MIN(end_date) AS start_date, MAX(end_date) AS end_date, FROM grouped_date_df GROUP BY source_id, group_date ORDER BY source_id, group_date)from final"""date_df = conn.execute(sql).df()gap_fig = px.timeline(date_df, x_start='start_date', x_end='end_date', y='source_id')gap_fig.update_yaxes(autorange="reversed")gap_fig.show()