#!/usr/bin/env python3
"""
Script to categorize all repositories using the LLM categorizer.
This is a standalone script that can be run to categorize all repositories
without running the full update process.
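
Typical invocations (all flags are defined in main() below):

    python categorize_all.py --dry-run
    python categorize_all.py --apply --min-confidence 85
    python categorize_all.py --only-consolidate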
"""
import os
import sys
import json
import argparse
from pathlib import Path

from dotenv import load_dotenv

from scripts.llm_categorizer import LLMCategorizer


def main():
    """Main function to categorize all repositories."""
    parser = argparse.ArgumentParser(description="Categorize all repositories using LLM")
    parser.add_argument("--min-confidence", type=int, default=80,
                        help="Minimum confidence score (0-100)")
    parser.add_argument("--apply", action="store_true",
                        help="Apply changes to category files")
    parser.add_argument("--batch-size", type=int, default=10,
                        help="Number of repositories to process in each batch")
    parser.add_argument("--start-from", type=str, default=None,
                        help="Start processing from this repository name")
    parser.add_argument("--delay", type=int, default=2,
                        help="Delay in seconds between API calls to avoid rate limiting")
    parser.add_argument("--dry-run", action="store_true",
                        help="Show what would be done without making API calls")
    parser.add_argument("--force", action="store_true",
                        help="Force categorization even for already categorized repositories")
    parser.add_argument("--max-repos", type=int, default=None,
                        help="Maximum number of repositories to process (useful for testing)")
    parser.add_argument("--resume-from-last", action="store_true",
                        help="Resume from the last processed repository based on logs")
    parser.add_argument("--consolidate", action="store_true",
                        help="Consolidate similar categories after categorization")
    parser.add_argument("--only-consolidate", action="store_true",
                        help="Only consolidate categories without categorizing repositories")
parser.add_argument("--no-confirm", action="store_true", default=True,
help="Skip confirmation prompt (default: True)")
args = parser.parse_args()

    # Load environment variables
    load_dotenv()

    # Check for OpenAI API key
    if not os.getenv("OPENAI_API_KEY") and not args.dry_run:
        print("Error: OPENAI_API_KEY environment variable not set.")
        print("Please set this in your .env file or environment.")
        return 1

    # Load repository data
    base_path = Path(__file__).parent
    json_path = base_path / "data" / "exports" / "repo-index.json"
    try:
        with open(json_path, "r") as f:
            repos_data = json.load(f)
    except Exception as e:
        print(f"Error loading repository data: {e}")
        return 1

    print(f"Loaded {len(repos_data)} repositories from {json_path}")

    # Initialize the categorizer
    categorizer = LLMCategorizer()
    categorizer.request_delay = args.delay  # Set custom delay between API calls

    # Only consolidate categories if requested
    if args.only_consolidate:
        print("\nOnly consolidating categories as requested...")
        categorizer.consolidate_similar_categories()
        return 0

    # Load current category assignments to check which repos are already categorized
    current_assignments = categorizer._load_current_assignments()
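    # NOTE: current_assignments is assumed to map repository name -> list of
    # assigned categories (inferred from the membership and truthiness checks
    # in the --force filter below).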

    # Try to resume from last run if requested
    if args.resume_from_last:
        # Create reports directory if it doesn't exist
        reports_dir = Path("reports")
        reports_dir.mkdir(exist_ok=True)

        # Get the latest report to resume from
        report_files = list(reports_dir.glob("categorization_report_*.json"))
        if not report_files:
            print("No previous reports found to resume from.")
            return 1

        latest_report = max(report_files, key=lambda f: f.stat().st_mtime)
        print(f"Resuming from last report: {latest_report}")
        with open(latest_report, "r") as f:
            report_data = json.load(f)

        # Get the last processed repository
        processed_repos = list(report_data["categorization_results"].keys())
        if not processed_repos:
            print("No processed repositories found in the last report.")
            return 1

        last_processed = processed_repos[-1]
        print(f"Last processed repository: {last_processed}")

        # Skip everything up to and including the last processed repository;
        # slicing past the end simply yields an empty list.
        repo_names = [repo.get('name', '') for repo in repos_data]
        if last_processed in repo_names:
            last_index = repo_names.index(last_processed)
            repos_data = repos_data[last_index + 1:]
            print(f"Skipping {last_index + 1} already processed repositories")
        else:
            print(f"Warning: Last processed repository {last_processed} not found in repository list.")

    # Filter repositories if starting from a specific one
    if args.start_from:
        start_index = next((i for i, repo in enumerate(repos_data)
                            if repo.get('name') == args.start_from), 0)
        if start_index > 0:
            print(f"Starting from repository {args.start_from} (index {start_index})")
            repos_data = repos_data[start_index:]
        elif repos_data and repos_data[0].get('name') != args.start_from:
            print(f"Warning: repository {args.start_from} not found; processing from the beginning.")

    # Filter out already categorized repositories unless --force is used
    if not args.force:
        uncategorized_repos = []
        for repo in repos_data:
            repo_name = repo.get('name', '')
            if repo_name not in current_assignments or not current_assignments[repo_name]:
                uncategorized_repos.append(repo)
        if len(uncategorized_repos) < len(repos_data):
            print(f"Filtered out {len(repos_data) - len(uncategorized_repos)} already categorized repositories.")
            print(f"Processing {len(uncategorized_repos)} uncategorized repositories.")
        repos_data = uncategorized_repos

    # Limit the number of repositories if max-repos is specified
    if args.max_repos and args.max_repos < len(repos_data):
        print(f"Limiting to {args.max_repos} repositories as requested")
        repos_data = repos_data[:args.max_repos]

    # Process repositories in batches
    total_repos = len(repos_data)
    if total_repos == 0:
        print("No repositories to process. All repositories may already be categorized.")
        print("Use --force to recategorize already categorized repositories.")
        return 0

    batch_size = min(args.batch_size, total_repos)
    num_batches = (total_repos + batch_size - 1) // batch_size  # Ceiling division

    # Dry run mode
    if args.dry_run:
        print("\nDRY RUN MODE - No actual API calls or changes will be made")
        print(f"Would process {total_repos} repositories in {num_batches} batches")
        print(f"Confidence threshold: {args.min_confidence}%")
        print(f"Apply changes: {args.apply}")
        print("First 5 repositories that would be processed:")
        for i, repo in enumerate(repos_data[:5]):
            print(f"  {i+1}. {repo.get('name', '')}")
        return 0

    # Skip confirmation and proceed directly (see the --no-confirm flag above)
    print(f"\nProcessing {total_repos} repositories in {num_batches} batches")
    print(f"Confidence threshold: {args.min_confidence}%")
    print(f"Apply changes: {args.apply}")
    if not args.apply:
        print("\nWARNING: Running without --apply will only show suggestions without making changes.")
        print("To apply the changes, run with the --apply flag.")

    # Track overall statistics
    total_suggestions = 0

    for batch_num in range(num_batches):
        start_idx = batch_num * batch_size
        end_idx = min(start_idx + batch_size, total_repos)
        batch = repos_data[start_idx:end_idx]

        print(f"\nProcessing batch {batch_num + 1}/{num_batches} "
              f"(repositories {start_idx + 1}-{end_idx} of {total_repos})")

        # Process the batch
        results = categorizer.batch_categorize(
            batch,
            min_confidence=args.min_confidence,
            apply_changes=args.apply
        )
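        # NOTE: results is assumed to map repository name -> list of suggested
        # categories (inferred from the len(cats) aggregation below).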

        # Count suggestions in this batch
        batch_suggestions = sum(len(cats) for cats in results.values())
        total_suggestions += batch_suggestions

        # Print batch summary
        print(f"Batch {batch_num + 1} complete: {batch_suggestions} suggestions for {len(batch)} repositories")

        # Consolidate categories after each batch if requested
        if args.consolidate and args.apply:
            print("\nConsolidating similar categories...")
            categorizer.consolidate_similar_categories()

    # Print overall summary
    print(f"\nCategorization complete: {total_suggestions} suggestions for {total_repos} repositories")

    # Final consolidation if requested
    if args.consolidate and args.apply:
        print("\nPerforming final consolidation of similar categories...")
        categorizer.consolidate_similar_categories()

    return 0


if __name__ == "__main__":
    sys.exit(main())