File size: 4,103 Bytes
9b744c5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
"""
Module which updates any of the issues to reflect changes in the issue state
"""
import json
import datetime
from defaults import TOKEN, OWNER, REPO

GITHUB_API_VERSION = "2022-11-28"



# Get the issues that have been updated since the last update

import json

import argparse

import requests
import os
import numpy as np
import json
import datetime
import logging

# Emit INFO-level records so the per-page progress logged by `get_issues` is visible.
logging.basicConfig(level=logging.INFO)

logger = logging.getLogger(__name__)

# Local timestamp captured once at import time (not referenced by the code below).
today = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")

# Defaults used by `get_issues` and by the command-line interface.
# NOTE: these assignments replace the TOKEN, OWNER and REPO values imported from
# `defaults` earlier in this file.
OWNER = "huggingface"
REPO = "transformers"
GITHUB_API_VERSION = "2022-11-28"
# None when the GITHUB_TOKEN environment variable is not set.
TOKEN = os.environ.get("GITHUB_TOKEN")
# Default path of the local issue file; `get_issues` reads it line by line, one JSON issue per line.
JSON_FILE = "issues.json"


def get_issues(
    input_filename=JSON_FILE,
    output_filename=JSON_FILE,
    github_api_version=GITHUB_API_VERSION,
    owner=OWNER,
    repo=REPO,
    token=TOKEN,
    n_pages=-1,
    since="2024-02-01T11:33:35Z",
):
    """
    Refresh a local JSON-lines issue file with the current state of the issues on GitHub.

    Issues of `owner/repo` are downloaded page by page from the GitHub REST API, with
    `since` passed as the `since` query parameter. Every line of `input_filename` whose
    issue number is part of the download is replaced by the freshly downloaded issue;
    all other lines are copied unchanged. Downloaded issues that are not present in
    `input_filename` are NOT added to the output file. Every downloaded issue, keyed by
    issue number, is also written to `updated_issues.json`.

    Args:
        input_filename: Path of the existing file holding one JSON-encoded issue per line.
        output_filename: Path the refreshed file is written to. It must already exist; it
            is only replaced once the whole input has been processed.
        github_api_version: Value sent in the `X-GitHub-Api-Version` header.
        owner: Owner of the repository.
        repo: Name of the repository.
        token: GitHub token. A value without an authentication scheme is sent as
            `Bearer <token>`; a value that already contains a scheme (e.g. `token ...`)
            is sent unchanged. When empty or `None` no `Authorization` header is sent.
        n_pages: Maximum number of pages to request. Zero or a negative value means
            "no limit".
        since: Timestamp sent as the `since` query parameter. Pass `None` to use the most
            recent `updated_at` value found in `issues_dict.json` instead.

    Returns:
        `output_filename`.

    Raises:
        ValueError: If `output_filename` does not exist, if `since` is `None` and
            `issues_dict.json` holds no issues, or if a request does not return
            status code 200.
    """
    if since is None:
        # Resume from the most recently updated issue we already know about.
        with open("issues_dict.json", "r") as f:
            known_issues = json.load(f)
        if not known_issues:
            raise ValueError("issues_dict.json contains no issues, cannot infer `since`")
        since = max(issue["updated_at"] for issue in known_issues.values())

    # This function only updates an existing file, so refuse to run without one
    if not os.path.exists(output_filename):
        raise ValueError(f"File {output_filename} does not exist")

    # Define the URL and headers
    url = f"https://api.github.com/repos/{owner}/{repo}/issues"
    headers = {
        "Accept": "application/vnd.github+json",
        "X-GitHub-Api-Version": f"{github_api_version}",
        "User-Agent": "amyeroberts",
    }
    if token:
        # GitHub expects "<scheme> <token>"; only add a scheme when the caller gave a bare token
        has_scheme = " " in token.strip()
        headers["Authorization"] = token if has_scheme else f"Bearer {token}"

    per_page = 100
    page = 1
    query_params = {
        "state": "all",
        "since": since,
        "sort": "created",
        "direction": "asc",
        # Must be sent explicitly: without it the API falls back to its own default page
        # size, and the "short page" check below would end the loop after the first page.
        "per_page": per_page,
        "page": page,
    }

    new_lines = []

    while n_pages <= 0 or page <= n_pages:
        query_params["page"] = page

        # Send the GET request
        response = requests.get(url, headers=headers, params=query_params)

        if response.status_code != 200:
            raise ValueError(
                f"Request failed with status code {response.status_code} and message {response.text}"
            )

        json_response = response.json()
        logger.info(f"Page: {page}, number of issues: {len(json_response)}")

        new_lines.extend(json_response)

        # A short (or empty) page means we've reached the end of the issues
        if len(json_response) < per_page:
            break

        page += 1

    issue_lines_map = {issue["number"]: issue for issue in new_lines}

    # Write next to the destination so the final replace never moves the file across
    # directories, and so that output paths containing a directory keep working.
    tmp_filename = os.path.join(
        os.path.dirname(output_filename), "tmp_" + os.path.basename(output_filename)
    )

    # "w" rather than "a": leftovers from an interrupted run must not end up in the output
    with open(input_filename, "r") as f, open(tmp_filename, "w") as g:
        for line in f:
            if not line.strip():
                # Keep blank lines as they are instead of failing to parse them
                g.write(line)
                continue
            number = json.loads(line)["number"]
            if number in issue_lines_map:
                g.write(json.dumps(issue_lines_map[number]))
                g.write("\n")
            else:
                g.write(line)

    # os.replace overwrites an existing destination on every platform
    os.replace(tmp_filename, output_filename)

    with open("updated_issues.json", "w") as f:
        json.dump(issue_lines_map, f, indent=4, sort_keys=True)

    return output_filename


if __name__ == "__main__":
    # Command-line flags, one per keyword argument forwarded to `get_issues`:
    # (flag, value type, default)
    cli_options = (
        ("--input_filename", str, JSON_FILE),
        ("--output_filename", str, JSON_FILE),
        ("--github_api_version", str, GITHUB_API_VERSION),
        ("--owner", str, OWNER),
        ("--repo", str, REPO),
        ("--token", str, TOKEN),
        ("--n_pages", int, -1),
    )

    cli = argparse.ArgumentParser()
    for flag, value_type, fallback in cli_options:
        cli.add_argument(flag, type=value_type, default=fallback)

    # The parsed namespace maps one-to-one onto the function's keyword arguments
    namespace = cli.parse_args()
    get_issues(**vars(namespace))