-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfetch_code.py
More file actions
174 lines (140 loc) · 6.4 KB
/
fetch_code.py
File metadata and controls
174 lines (140 loc) · 6.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
#!/usr/bin/env python3
"""
Script to fetch Java source code from either a single URL or a CSV file of URLs.

Usage:
    fetch_code.py csv path_to_csv_file
        The CSV file should contain file_name,source_type,url triples, one per line.
    fetch_code.py url file_name source_type url
        Handles a single URL; file_name and source_type must also be provided.

Format for file_name, source_type:
    file_name   := full file name, e.g. android.app.Service
    source_type := android.googlesource | github
"""
import argparse
import base64
import csv
import os
import sys
from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit

import requests
from dotenv import load_dotenv
# Load variables from a .env file (if one is found) before reading the environment below.
load_dotenv()
# Base directory for output; write_code_to_file() places files under
# <project_root>/java_source_code_files. Falls back to the current directory
# when PROJECT_ROOT is not set.
project_root = os.environ.get("PROJECT_ROOT", ".")
def build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser with its two sub-commands.

    Sub-commands (stored in ``args.mode``):
        url: takes the positionals ``file_name``, ``source_type`` and ``url``.
        csv: takes the positional ``path_to_csv_file``.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    root = argparse.ArgumentParser(
        description="Handle a single URL or a list of URLs in a CSV file."
    )
    modes = root.add_subparsers(dest="mode", required=True, help="Choose input mode")

    # "url" mode: positionals are registered in command-line order.
    single = modes.add_parser("url", help="Process one URL")
    url_positionals = (
        ("file_name", "file_name - used to name the output file"),
        # ("package_name", "Package the file belongs to"),  # currently disabled
        ("source_type", "source of the java code, options: 'android.googlesource', 'github'"),
        ("url", "Provide URL to process"),
    )
    for arg_name, arg_help in url_positionals:
        single.add_argument(arg_name, type=str, help=arg_help)

    # "csv" mode: a single positional pointing at the CSV file.
    batch = modes.add_parser("csv", help="Process a CSV file of URLs")
    batch.add_argument(
        "path_to_csv_file",
        type=str,
        help="Path to the CSV file containing URLs",
    )
    return root
def write_code_to_file(source_code, file_name, source_type, url):
    """Write fetched source code to disk, prefixed with a metadata header.

    The output path is derived from ``file_name`` by
    ``create_output_file_path`` under ``<project_root>/java_source_code_files``.
    Failures to create the directories or write the file are reported on
    stdout instead of being raised, so a batch run can continue.

    Args:
        source_code: Text of the fetched source file.
        file_name: Dotted file name, e.g. ``android.app.Service``.
        source_type: Source identifier recorded in the header.
        url: URL the code was fetched from, recorded in the header.
    """
    # Prepend metadata as Java line comments.
    metadata = (
        f"// file_name : {file_name}\n"
        f"// source_type : {source_type}\n"
        f"// url : {url}\n\n"
    )
    try:
        # Directory creation is inside the try so that its OSError is handled
        # the same way as a failed write.
        file_path = create_output_file_path(
            file_name,
            base_dir=os.path.join(project_root, "java_source_code_files"),
        )
        # Explicit UTF-8: the platform default encoding may be unable to
        # represent non-ASCII characters present in the fetched source.
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(metadata + source_code)
    except OSError as e:
        print(f"Error while writing source code to file: {e}")
        print(f"File Details: {file_name}, {source_type}, {url}")
        print("Moving onto next URL.")
    else:
        print(f"Successfully wrote source code from {file_name} to {file_path}.")
def create_output_file_path(file_name, base_dir='.', ext="java"):
    """
    Map a dotted file name onto a path below ``base_dir``.

    Every dot-separated component except the last becomes a directory
    (created if missing); the last component becomes the file's base name,
    with ``ext`` appended. The file itself is not created.

    Example:
    android.app.Service → ./android/app/Service.java
    """
    segments = file_name.split('.')
    package_dirs = segments[:-1]
    class_name = segments[-1]

    target_dir = os.path.join(base_dir, *package_dirs)
    # Make sure the directory tree is present; an existing tree is fine.
    os.makedirs(target_dir, exist_ok=True)

    return os.path.join(target_dir, f"{class_name}.{ext}")
def fetch_code_from_single_url(file_name, source_type, url):
    """
    Log the request and delegate to the fetcher matching ``source_type``.

    Supported values of ``source_type`` are ``"android.googlesource"`` and
    ``"github"``; anything else raises ``ValueError``. ``file_name`` is only
    used for logging. Returns whatever the selected fetcher returns.
    """
    print(f"Processing single URL: {url}")
    print(f"File Details: {file_name}, {source_type}")

    if source_type == "github":
        return fetch_code_from_github(url)
    if source_type == "android.googlesource":
        return fetch_code_from_android_googlesource(url)

    raise ValueError(f"Unsupported source type: {source_type}")
def fetch_code_from_android_googlesource(url):
    """
    Fetches and decodes raw Java source from android.googlesource.com.

    The URL is requested with the query parameter ``format=TEXT`` and the
    response body is base64-decoded and interpreted as UTF-8.

    Args:
        url: URL of the file; may already carry a query string and/or a
            ``#fragment`` (e.g. a line anchor).

    Returns:
        The decoded source code as a string.

    Raises:
        Exception: If the HTTP status code is not 200.
    """
    parts = urlsplit(url)
    # Rebuild the query instead of appending "?format=TEXT" to the raw string:
    # appending breaks URLs that already have a query and hides the parameter
    # inside the fragment for URLs that contain "#...".
    query_pairs = [
        (key, value)
        for key, value in parse_qsl(parts.query, keep_blank_values=True)
        if key != "format"
    ]
    query_pairs.append(("format", "TEXT"))
    text_url = urlunsplit(
        parts._replace(query=urlencode(query_pairs), fragment="")
    )

    response = requests.get(text_url, timeout=10)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch source: {response.status_code}")
    return base64.b64decode(response.text).decode("utf-8")
def fetch_code_from_github(url):
    """
    Fetches raw Java source from GitHub. Converts regular GitHub URLs to raw URLs if needed.

    A URL whose host is ``github.com`` (or ``www.github.com``) is rewritten to
    ``raw.githubusercontent.com``, dropping the ``blob`` path segment that
    follows ``/<owner>/<repo>/``. Any other URL is requested unchanged.

    Args:
        url: GitHub page URL or raw-content URL of the file.

    Returns:
        The response body as text.

    Raises:
        Exception: If the HTTP status code is not 200.
    """
    parts = urlsplit(url)
    if parts.hostname in ("github.com", "www.github.com"):
        segments = parts.path.split("/")
        # segments == ['', owner, repo, 'blob', ref, ...]; remove only that one
        # marker so a directory or file named "blob" deeper in the path survives.
        if len(segments) > 3 and segments[3] == "blob":
            del segments[3]
        url = urlunsplit(
            parts._replace(
                netloc="raw.githubusercontent.com",
                path="/".join(segments),
            )
        )

    response = requests.get(url, timeout=10)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch source: {response.status_code}")
    return response.text
def _parse_csv_line(line):
    """Return the stripped [file_name, source_type, url] fields of one CSV line, or None if malformed."""
    try:
        fields = next(csv.reader([line], skipinitialspace=True))
    except (csv.Error, StopIteration):
        return None
    fields = [field.strip() for field in fields]
    if len(fields) != 3 or not all(fields):
        return None
    return fields


def _run_url_mode(file_name, source_type, url):
    """Fetch one URL and write the result; exit with status 1 if nothing could be fetched."""
    try:
        source_code = fetch_code_from_single_url(file_name, source_type, url)
    except Exception as e:
        # Top-level boundary: report the failure the same way csv mode does
        # instead of ending in a traceback.
        print(f"Error while fetching code: {e}")
        sys.exit(1)
    if not source_code:
        print("Error. No code fetched. Exiting.")
        sys.exit(1)
    print("Fetched Java source code successfully.")
    write_code_to_file(source_code, file_name, source_type, url)


def _run_csv_mode(csv_path):
    """Fetch every file_name,source_type,url line of a CSV file, skipping lines that fail."""
    print(f"Processing CSV file: {csv_path}")
    # A missing path or a directory is rejected up front with the same message.
    if not os.path.isfile(csv_path):
        print(f"Error: The specified CSV file does not exist: {csv_path}")
        sys.exit(1)

    skipped = 0
    # "utf-8-sig" reads plain UTF-8 and also drops a leading BOM, which would
    # otherwise become part of the first file_name.
    with open(csv_path, "r", encoding="utf-8-sig") as f:
        for raw_line in f:
            # line := file_name,source_type,url
            line = raw_line.strip()
            if not line:
                continue
            fields = _parse_csv_line(line)
            if fields is None:
                print(f"Skipping malformed line: {line}")
                print("=" * 50)
                skipped += 1
                continue
            file_name, source_type, url = fields
            try:
                source_code = fetch_code_from_single_url(file_name, source_type, url)
            except Exception as e:
                # Deliberate best effort: one failing URL must not abort the batch.
                print(e)
                print(f"Skipping this line: {line}")
                print("=" * 50)
                skipped += 1
                continue
            if source_code:
                print("Fetched Java source code successfully.")
                write_code_to_file(source_code, file_name, source_type, url)
            else:
                # The fetch succeeded but returned an empty body.
                print(f"Fetching code failed.\nSkipping line: {line}.")
                skipped += 1
            print("=" * 50)

    if skipped:
        # Do not claim full success when some lines were skipped.
        print(f"Finished processing the CSV file; {skipped} line(s) were skipped.")
    else:
        print("Code has been fetched from all URLs in the CSV file.")


def main():
    """Parse the command line and run the selected mode (``url`` or ``csv``).

    Exits with status 1 when the single URL cannot be fetched or when the
    CSV file cannot be found.
    """
    args = build_parser().parse_args()
    if args.mode == "url":
        _run_url_mode(args.file_name, args.source_type, args.url)
    elif args.mode == "csv":
        _run_csv_mode(args.path_to_csv_file)


if __name__ == "__main__":
    main()
### Task List:
# - [ ] Update to work with the new phase based directory structure
# - [ ] Automatically write failed URLs, along with their file_name and source_type,
#       to a separate CSV file, as is done in label_llm.py