-
Notifications
You must be signed in to change notification settings - Fork 4
/
github.py
77 lines (62 loc) · 2.58 KB
/
github.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
"""The Python script retrieves the 100 most starred public repositories from the
GitHub API, extracts relevant data, and inserts it into a BigQuery table.
It requires the requests, google-cloud-bigquery, and google-auth libraries to be
installed. The script can be run with a service account key stored
in /home/service.json."""
from datetime import datetime
import requests
from google.cloud import bigquery
from google.oauth2 import service_account
from google.cloud import exceptions as bq_exce
# Fetch the 100 most-starred public repositories from the GitHub search API.
# q=stars:>0 matches every starred repo; sort=stars orders by star count.
response = requests.get(
    "https://api.github.com/search/repositories?q=stars:>0&sort=stars&per_page=100",
    timeout=10
)
# Fail fast on HTTP errors (rate limiting, 5xx). Without this, an error
# payload has no "items" key and the loop below dies with a confusing KeyError.
response.raise_for_status()
# Decode the JSON response body.
data = response.json()

# Collect one (name, description, stars, forks, language, created_date)
# tuple per repository. NOTE(review): "description" and "language" can be
# null in the GitHub payload, so those slots may hold None.
repos_data = []
for repo in data["items"]:
    # created_at is an ISO-8601 UTC timestamp, e.g. "2010-01-02T03:04:05Z";
    # only the calendar date is kept for the DATE column.
    created_date = datetime.strptime(
        repo["created_at"], "%Y-%m-%dT%H:%M:%SZ"
    ).date()
    repos_data.append(
        (
            repo["name"],
            repo["description"],
            repo["stargazers_count"],
            repo["forks_count"],
            repo["language"],
            created_date,
        )
    )
# Authenticate to BigQuery with the service-account key file.
credentials = service_account.Credentials.from_service_account_file(
    "/home/service.json"
)
client = bigquery.Client(credentials=credentials, project="clean-phone-438712")
# Destination table schema; mirrors the tuples collected above.
schema = [
    bigquery.SchemaField("name", "STRING"),
    bigquery.SchemaField("description", "STRING"),
    bigquery.SchemaField("stars", "INTEGER"),
    bigquery.SchemaField("forks", "INTEGER"),
    bigquery.SchemaField("language", "STRING"),
    bigquery.SchemaField("created_date", "DATE"),
]
# Reference to <project>.earthquake.table_m.
table_ref = client.dataset("earthquake").table("table_m")
# Create the table only if it is missing: exists_ok=True makes the call
# idempotent, replacing the get_table / except NotFound round trip with a
# single request.
client.create_table(bigquery.Table(table_ref, schema=schema), exists_ok=True)
# Load the destination table's live metadata and stream the rows into it.
table = client.get_table(table_ref)
# Serialize each tuple into a row list, rendering the date as YYYY-MM-DD
# for the DATE column.
rows_to_insert = []
for name, description, stars, forks, language, created_date in repos_data:
    rows_to_insert.append(
        [name, description, stars, forks, language,
         created_date.strftime("%Y-%m-%d")]
    )
errors = client.insert_rows(table, rows_to_insert)
# insert_rows returns a (possibly empty) list of per-row error dicts.
if errors:
    print(f"Errors occurred while inserting data into BigQuery: {errors}")
else:
    print("Data inserted into BigQuery successfully!")