from flask import jsonify
from scrape_website import save_to_md
from segmenter.segment import segment
from code_from_visdoc.github_service import GitHubService
from code_from_visdoc.openai_service import OpenAIService
from code_from_visdoc.config import Config
from code_from_visdoc.utils import parse_openai_single_json
from segmenter.transformers_call import SentenceFeatureExtractor
import copy
import requests
from utils import save_llm_output
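
# Shape of the `data` structure these helpers operate on, inferred from its
# usage below rather than from a formal schema (the example values are
# illustrative only): 'content' maps topic names to text, 'flow' is a list of
# {"sequence": ..., "edges": [...]} items whose edges carry 'source' and
# 'target' (and optionally 'edge_label'), and 'links' maps topic names to the
# raw links extracted from them.
#
# data = {
#     "content": {"Installation": "Run pip install ..."},
#     "flow": [{"sequence": "Setup",
#               "edges": [{"source": "Installation", "target": "Configuration"}]}],
#     "links": {"Installation": ["docs/setup.md"]},
# }
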
def process_md_and_wiki(topic, link, data, github_service, github_url_components, custom_file_path=None):
    """
    Process a .md or .wiki link and return the second-layer nodes
    extracted from it.
    """
    print(f"Processing MD/WIKI link: {link}")
    # Strip anchors and query parameters before checking the extension.
    clean_link = link.split('#')[0].split('?')[0]
    content_from_link = None
    if clean_link.endswith(".md"):
        print(f"Processing MD filepath: {github_url_components.filepath}")
        content_from_link = get_new_nodes_from_md(link, github_service, github_url_components, custom_file_path=custom_file_path)
        print("Fetching content_from_link succeeded.")
    elif clean_link.endswith(".wiki"):
        print(f"Processing wiki link: {link}")
        content_from_link = get_new_nodes_from_wiki(link, github_service, github_url_components)
    elif '/tree/' in clean_link and 'github.com' in clean_link:
        print(f"Processing file inside another folder: {link}")
        content_from_link = get_new_nodes_from_md(link, github_service, github_url_components, custom_file_path=custom_file_path)
    else:
        print("Link is outbound to pages outside the current repository.")
    return content_from_link

def get_new_nodes_from_md(link, github_service, github_url_components, custom_file_path=None):
    """
    Download the linked markdown file, segment it, and ask the LLM to
    sequence the segments, mirroring the first-layer pipeline in app.py.
    """
    owner = github_url_components.owner
    repo = github_url_components.name
    if custom_file_path:
        file_path = custom_file_path
    else:
        file_path = github_url_components.filepath
    print(owner, repo, file_path)
    if not owner or not repo or not file_path:
        return jsonify({"error": "The link is missing one or more of: owner, repo, file path."}), 400
    openai_service = OpenAIService(Config.OPENAI_API_KEY, repo)
    sentence_feature_extractor = SentenceFeatureExtractor()
    files_and_contents = github_service.download_recursive(owner, repo, file_path)
    if not files_and_contents:
        return None
    file_and_content = files_and_contents[0]
    file_name = repo + '_' + file_and_content[0].split('/')[-1]
    content = file_and_content[1]
    md_file_path = save_to_md(content, file_name)
    predicted_segmentation, segments, segmented_file_path = segment(
        sentence_feature_extractor, md_file_path, openai_service, file_name,
        segmentation_method='langchain', sentence_method='stanza',
        save_to_file=True, repo=repo, filename=file_path)
    # Call the LLM to find the sequence of the new segments.
    prompt = copy.deepcopy(openai_service.fetch_prompt('PROMPT_FOR_SEQUENCING_SECOND_LAYER'))
    for item in prompt:
        if item["role"] == "user":
            item["content"] += str(segments)
    # Dump the full prompt for debugging.
    with open('llm_prompt.txt', 'w') as outfile:
        outfile.write(str(prompt))
    llm_result = parse_openai_single_json(openai_service.get_llm_response_json(prompt))
    return llm_result

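
# Note: the parsed llm_result is expected to carry the same
# {'content': ..., 'flow': ...} shape that attach_second_layer() consumes;
# this is an assumption inferred from how the result is used downstream,
# not a documented contract.
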
def get_new_nodes_from_wiki(link, github_service, github_url_components):
    """
    Get all the segments from a .wiki file, mirroring get_new_nodes_from_md.
    Not implemented yet; callers currently receive None.
    """
    pass

def add_second_layer_from_links(data, file_path, github_url_components):
    """
    Post-process the links after they have been extracted.
    1. If a link ends with '.md' or '.wiki' (or points to a github.com
       tree), fetch its content via process_md_and_wiki() and attach it.
    2. Otherwise (external link), count it in outbound_link_counter;
       outbound links are not attached to the graph.
    """
    print("Current filepath =", file_path)
    github_service = GitHubService(Config.GITHUB_TOKEN)
    outbound_link_counter = 0  # counts external links (informational only)
    # Cache of already-processed links and their contents.
    processed_links = {}
    # If there are no links, `data` is returned unchanged.
    print("\n\nLINKS in the first file:")
    print(len(data["links"]))
    # Iterate over each topic and its list of extracted links.
    for topic, links_list in data.get("links", {}).items():
        print("Second page started...")
        print("Processing for topic:", topic)
        topic_links = []
        for link in links_list:
            # Check the portion before any '#' fragment or query parameters
            # to see whether the link ends with .md or .wiki.
            clean_link = link.split('#')[0].split('?')[0]
            if clean_link.endswith(".md") or clean_link.endswith(".wiki") or ('/tree/' in clean_link and 'github.com' in clean_link):
                topic_links.append(clean_link)
            else:
                outbound_link_counter += 1
        # Process only the unique links in each topic.
        uniq_topic_links = list(set(topic_links))
        print("Links in the topic:", uniq_topic_links)
        # If a link points back to the same page (a different element on it),
        # remove it as well.
        if file_path in uniq_topic_links:
            uniq_topic_links.remove(file_path)
        for clean_link in uniq_topic_links:
            print("Processing for link:", clean_link)
            content_from_link = None
            if clean_link in processed_links:  # link was already processed
                print(f"Reusing cached content for {clean_link}")
                content_from_link = processed_links[clean_link]
                print(content_from_link)
                attach_second_layer(data, content_from_link, topic, clean_link)
            elif clean_link.startswith(('http://', 'https://')):
                # Absolute URL.
                print("Absolute link found")
                # If the link points at a GitHub tree, pull the .md files
                # out of that tree.
                if '/tree/' in clean_link and 'github.com' in clean_link:
                    if 'templates' in clean_link:
                        # Template directories nest one level deeper, so
                        # walk their child directories first.
                        print("Processing templates")
                        template_directory = clean_link.split('/')[-1]
                        template_dir_url = get_directory_url(template_directory, github_url_components)
                        dir_paths = get_child_dirs(template_dir_url)
                        for child_dir in dir_paths:
                            directory_url = get_directory_url(child_dir, github_url_components)
                            file_paths = get_file_urls_from_dir(directory_url)
                            for child_file_path in file_paths:
                                if child_file_path.endswith(".md"):
                                    print("Found a md file inside templates folder:", child_file_path)
                                    content_from_link = process_md_and_wiki(topic, clean_link, data, github_service, github_url_components, custom_file_path=child_file_path)
                                    attach_second_layer(data, content_from_link, topic, child_file_path)
                    else:
                        # Non-template docs folders hold the .md files directly.
                        print("Processing other docs")
                        directory = clean_link.split('/')[-1]
                        dir_url = get_directory_url(directory, github_url_components)
                        file_paths = get_file_urls_from_dir(dir_url)
                        for child_file_path in file_paths:
                            if child_file_path.endswith(".md"):
                                content_from_link = process_md_and_wiki(topic, clean_link, data, github_service, github_url_components, custom_file_path=child_file_path)
                                attach_second_layer(data, content_from_link, topic, child_file_path)
            else:
                print("Relative link found")
                # Resolve the relative link against the current file path.
                new_link = github_service.create_new_filepath(file_path, clean_link)
                github_url_components.filepath = new_link
                print(new_link)
                content_from_link = process_md_and_wiki(topic, new_link, data, github_service, github_url_components)
                print(content_from_link)
                # Store the processed content in the cache for reuse.
                processed_links[clean_link] = content_from_link
                attach_second_layer(data, content_from_link, topic, clean_link)
    return data

def rename_duplicate_topics(data, second_layer_nodes):
    """
    Ensures no topic in second_layer_nodes['content'] duplicates another
    second-layer topic or any existing topic in data['content'].
    If a collision is found, the duplicate second-layer topic is renamed
    with a '#<counter>' suffix. second_layer_nodes is updated in place
    (both the 'content' keys and the 'flow' edges).
    """
    # 1. Build a set of all existing topics from data.
    existing_topics = set(data["content"].keys())
    # 2. Create a rename map for the second-layer topics.
    rename_map = {}
    # Gather the second-layer topics in a list so iteration is deterministic.
    second_layer_topic_names = list(second_layer_nodes["content"].keys())
    for old_topic in second_layer_topic_names:
        # Skip topics that have already been processed.
        if old_topic not in rename_map:
            if old_topic in existing_topics:
                # Generate a unique new topic name.
                i = 1
                new_topic = f"{old_topic}#{i}"
                while new_topic in existing_topics:
                    i += 1
                    new_topic = f"{old_topic}#{i}"
                rename_map[old_topic] = new_topic
                existing_topics.add(new_topic)
            else:
                # The topic is new overall; keep it as-is.
                rename_map[old_topic] = old_topic
                existing_topics.add(old_topic)
    # 3. Rename the 'content' keys in second_layer_nodes.
    new_content = {}
    for old_topic, text in second_layer_nodes["content"].items():
        new_content[rename_map[old_topic]] = text
    second_layer_nodes["content"] = new_content
    # 4. Update edges in second_layer_nodes['flow'] to reflect the renames.
    if "flow" in second_layer_nodes:
        flow_value = second_layer_nodes["flow"]
        # A single dict is wrapped in a list so we can iterate uniformly.
        if isinstance(flow_value, dict):
            flow_items = [flow_value]
        else:
            flow_items = flow_value
        for flow_item in flow_items:
            for edge in flow_item.get("edges", []):
                if edge["source"] in rename_map:
                    edge["source"] = rename_map[edge["source"]]
                if edge["target"] in rename_map:
                    edge["target"] = rename_map[edge["target"]]
        # Write the flow back in its original shape.
        if isinstance(flow_value, dict):
            second_layer_nodes["flow"] = flow_items[0]
        else:
            second_layer_nodes["flow"] = flow_items

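
# Illustrative example (hypothetical topic names): if data['content'] already
# contains "Usage" and second_layer_nodes['content'] also contains "Usage",
# the second-layer topic is renamed to "Usage#1", and every edge in
# second_layer_nodes['flow'] whose source or target was "Usage" now points
# to "Usage#1".
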
def attach_second_layer(data, second_layer_nodes, topic, link):
    """
    Merges second_layer_nodes into data so that:
    1) second_layer_nodes["content"] entries are merged into data["content"].
    2) All new edges are placed in the flow item with the same sequence as the parent topic.
    3) The first node in second_layer_nodes is linked as a child of 'topic' in data.
    4) No duplicate topics remain in the final structure (renamed if necessary).
    """
    # A) Rename duplicates in second_layer_nodes to avoid conflicts.
    rename_duplicate_topics(data, second_layer_nodes)
    # B) Merge the second-layer content into data.
    data["content"].update(second_layer_nodes.get("content", {}))
    # C) Normalize the second-layer flow to a list.
    flow_value = second_layer_nodes.get("flow", [])
    if isinstance(flow_value, dict):
        flow_items = [flow_value]
    else:
        flow_items = flow_value
    if not flow_items:
        print("Found no flow to attach.")
        return data
    print("Attaching flow from second layer")
    # Identify the first node in second_layer_nodes
    # (assumed to be the source of the first edge).
    first_flow_item = flow_items[0]
    first_edges = first_flow_item.get("edges", [])
    if not first_edges:
        print("Found no edges")
        return data  # No edges, nothing to attach.
    first_node = first_edges[0]["source"]
    # D) Find the flow item in data with the same sequence as the parent topic.
    topic_sequence = None
    parent_flow_item = None
    for flow_item in data["flow"]:
        if any(edge["source"] == topic or edge["target"] == topic for edge in flow_item["edges"]):
            topic_sequence = flow_item.get("sequence")
            parent_flow_item = flow_item
            break  # Stop once the matching sequence is found.
    if topic_sequence is None:
        # No matching sequence; fall back to a new one.
        topic_sequence = "Attached second layer"
    # E) Ensure we have a parent flow item to append to.
    if not parent_flow_item:
        parent_flow_item = {"edges": [], "sequence": topic_sequence}
        data["flow"].append(parent_flow_item)
    # F) Add the bridging edge (topic -> first_node) to the parent flow item.
    bridging_edge = {"source": topic, "target": first_node, "edge_label": link}
    parent_flow_item["edges"].append(bridging_edge)
    # G) Append the second-layer edges, labelling only the first one.
    label_added = False
    for item in flow_items:
        for edge in item.get("edges", []):
            edge_with_label = {
                "source": edge["source"],
                "target": edge["target"],
            }
            if not label_added:
                edge_with_label["edge_label"] = link
                label_added = True
            parent_flow_item["edges"].append(edge_with_label)
    # H) Save the merged data to a file for inspection.
    file_prefix = "second_layer_merged_data" + topic
    save_llm_output(data, file_prefix)
    return data

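
# Illustrative result (hypothetical values): attaching a second layer whose
# first edge is {"source": "Install", "target": "Verify"} under the parent
# topic "Setup" with link "docs/install.md" appends the bridging edge
# {"source": "Setup", "target": "Install", "edge_label": "docs/install.md"}
# to the parent topic's flow item, followed by the second-layer edges.
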
def get_directory_url(dir_path, github_url_components):
    """Build the GitHub contents-API URL for a directory in the repository."""
    base_url = 'https://api.github.com/repos/'
    base_url = base_url + github_url_components.owner + '/' + github_url_components.name + '/contents/'
    return base_url + dir_path

def get_file_urls_from_dir(dir_url):
    """Return the paths of all files directly inside the given directory URL."""
    headers = {"Accept": "application/vnd.github.v3+json"}
    response = requests.get(dir_url, headers=headers)
    files = []  # default, so a failed request returns an empty list
    if response.status_code == 200:
        files = [item["path"] for item in response.json() if item["type"] == "file"]
        print("Files in the folder:", files)
    else:
        print("Failed to retrieve files. Status Code:", response.status_code, response.json())
    return files

def get_child_dirs(dir_url):
    """Return the paths of all child directories inside the given directory URL."""
    headers = {"Accept": "application/vnd.github.v3+json"}
    response = requests.get(dir_url, headers=headers)
    dirs = []  # default, so a failed request returns an empty list
    if response.status_code == 200:
        dirs = [item["path"] for item in response.json() if item["type"] == "dir"]
        print("Child directories in the templates folder:", dirs)
    else:
        print("Failed to retrieve directories. Status Code:", response.status_code, response.json())
    return dirs
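

# Minimal usage sketch, kept commented out because it assumes a first-layer
# JSON file produced elsewhere in the pipeline; the file name and URL are
# illustrative, not part of this module:
#
# if __name__ == "__main__":
#     import json
#     from code_from_visdoc.github_link_parser import parse_github_url
#
#     components = parse_github_url(
#         "https://github.com/<owner>/<repo>/blob/main/README.md")
#     with open("first_layer.json") as f:
#         first_layer = json.load(f)
#     merged = add_second_layer_from_links(
#         first_layer, components.filepath, components)
#     print(len(merged["content"]), "topics after merging the second layer")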