-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbigquery.py
103 lines (94 loc) · 2.2 KB
/
bigquery.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# %%
import time
import pandas as pd
from google.cloud import bigquery
# Construct a BigQuery client object
client = bigquery.Client()
# %%
# banteg query
# Define the SQL query
# query = """
# SELECT actor.login, actor.id
# FROM `githubarchive.year.2018`
# WHERE actor.login IN ('rishu2403', 'tunnelpr0', 'pm-cs', 'evilebottnawi', 'wernf', 'remyabel')
# GROUP BY 1, 2
# """
# query_job = client.query(query)
# results = query_job.result() # Waits for the query to finish
# for row in results:
# print(f"Login: {row.login}, ID: {row.id}")
# %%
dataset_id = 'bigquery-public-data.github_repos'
tables = client.list_tables(dataset_id)
print(f"Tables contained in '{dataset_id}':")
for table in tables:
print(table.table_id)
# %%
start_time = time.time()
# `bigquery-public-data.github_repos.commits`
query = """
SELECT
*
FROM `githubarchive.month.202301`
WHERE type = 'Commit'
LIMIT 10
"""
query_job = client.query(query)
results = query_job.result()
headers = [header.name for header in results.schema]
df = results.to_dataframe()
display(df)
print(f"queried {len(df)} rows in {time.time() - start_time} seconds")
# %%
df.head(1).T
# %%
# query unique types
query = """
SELECT
type
FROM `githubarchive.month.202301`
GROUP BY 1
"""
query_job = client.query(query)
results = query_job.result()
headers = [header.name for header in results.schema]
df = results.to_dataframe()
display(df)
print(", ".join(df.type.values))
# %%
print(", ".join(df.type.values))
# %%
query = """
SELECT
actor.login,
repo.name
FROM
`githubarchive.year.{year}`
WHERE
type = 'PushEvent'
AND actor.login in {users}
GROUP BY 1, 2
"""
year = 2023
strk = pd.read_parquet('strk.parquet')
users = "('" + "', '".join(strk.identity.values.tolist()) + "')"
query_job = client.query(query.format(year=year, users=users))
results = query_job.result()
headers = [header.name for header in results.schema]
df = results.to_dataframe()
df.to_parquet(f"user_repos_{year}.parquet")
display(df)
# %%
query = """
SELECT
*
FROM
`bigquery-public-data.github_repos.commits`
LIMIT 1
"""
query_job = client.query(query)
results = query_job.result()
headers = [header.name for header in results.schema]
df = results.to_dataframe()
display(df)
# %%