-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathsplit_course_for_LLM.py
executable file
·282 lines (250 loc) · 11.8 KB
/
split_course_for_LLM.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
import sys
import os
import re
# Split course.md into multiple files for LLM.
# Each file is named step_1.md, step_2.md, etc. and contains a step to present to Claude.
# Additional sections are present in the course in comments that capture a conversation history.
# through_#.md files are also produced that containing the complete history to that split in the conversation
# to the point that the conversation ends (<!-- /claude --> no longer precedes <!-- /split -->).
#
# An additional file, LLM_conversation.txt, provides the beginning of the conversation.
#
# The following pragmas control processing:
# <!-- /split --> - start a new step_#.md file
# <!-- /step --> - continue in the same file, but increment the step number
# <!-- /hide --> - stop outputting to file until <!-- /show -->, <!-- /split -->, <!-- /steve --!>, or <!-- /claude -->.
# <!-- /show --> - re-enable outputting the course (also implied by other tags)
# <!-- /claude ... --> - a response from Claude. This is hidden in step_x.md, but shown as "Assistant: " in through_x.md. Ended by <!-- /split --> or <!-- /help -->.
# <!-- /steve ... --> - text from Steve. May be outside or within course. This appears in step_x.md and, as "Human: " in through_x.md.
# <!-- /help ... --> - text from Steve in response to Claude's incorrect response. This is shown in through_x.md, but not in step_x.md. Ended by <!-- /claude -->.
# <!-- /split ... --> - same as <!-- /split -->, but also contains /hint text to start the conversation instead of the default text.
# The course content is delimited in the output files for Claude with <course></course> tags.
#
# So the structure is:
# <!-- /split
# (optional multi-line prompt text)
# -->
#
# ... course content ...
#
# (optionally, any number of times:)
# <!-- /steve
# ... multi-line clarifications ...
# -->
#
# ... course content, including the use of <!-- /step, /hide, /show --> ...
#
# <!-- /claude
#
# ... Claude's response ...
#
# (optionally, any number of times:)
# <!-- /help
# ... multi-line clarifications for Claude ...
# -->
#
# <!-- /claude
# ... Claude's response ...
# -->
#
# <!-- /split ... -->
# ...
#
# If a <!-- /split ... --> does not follow <!-- /claude ... -->, then we are missing Claude's response, and this is the end of the conversation history for through_#.md files.
# Delete all existing output files
for filename in os.listdir('LLM_conversation'):
os.remove(f'LLM_conversation/{filename}')
# Opens output file for given step number
last_output_step_num = 0
def open_output_file(num):
global last_output_step_num
if num <= last_output_step_num:
print(f'Error on line {line_num}: snippet has no step')
last_output_step_num = num
return open(f'LLM_conversation/step_{num}.md', 'w')
# Opens "through" file for given step number
last_through_step_num = 0
def open_through_file(num, mode):
global last_through_step_num
assert num > last_through_step_num
last_through_step_num = num
return open(f'LLM_conversation/through_{num}.txt', mode)
def close_conversation(file):
# Since we are providing this in a file, Claude has a habit of continuing the converstation, playing the role of both parties.
# This tells Claude to wrap it up.
#file.write("\n\nUnfortunately, I will not be able to reply further, but please reply to this final message/steps as if our conversation were to continue without adding any concluding thoughts or goodbyes. This is the last you'll hear from me.\n")
file.close()
# Start with output file step_1.md
step_num = 1
output_file = open_output_file(step_num)
through_file = open_through_file(step_num, 'w')
through_file.write('This file contains our conversation history. Treat the contents of this file as you would the contents of a conversation. It is not meant to be a hypothetical conversation. Treat it as our actual conversation. Reply to the final prompt and continue no further. Do not continue with hypothetical conversation tags A: and H:. There is to be no hypothetical conversation in your output. Stop immediately after answering the final prompt in this file.\n\n')
# Open debug output file in which we output each line with state information.
debug_file = open('course_debug.txt', 'w')
hide = False
section = "split" # The current block, e.g. "claude" for <!-- /claude -->.
in_course = True # Truthy iff inside <course>.
# True if we are inside <course>.
# False if we have not yet encountered course content for this step_#.md.
# '' when temporarily exited from course (in /steve).
# None if we are done with course content for this step_#.md (began <!-- /claude -->).
line_num = 0
has_history = True # True to the point where we stop saving through_#.txt files.
inside_comment = False # True if we are inside a <!-- --> comment.
comment_line = False # True if the current line is a comment line. Same as inside_comment, except for <!-- /split -->.
# Begin each file with the content of the "LLM_conversation.txt" file.
with open('LLM_conversation.txt', 'r') as f:
for line in f:
output_file.write(line)
through_file.write(line)
close_course = '</course>\n\n'
def check(cond, msg):
global line_num
if not cond:
print(f'Error on line {line_num}: {msg}')
# Read input file line by line
with open('course.md', 'r') as f:
for line in f:
debug_line = line
line_num += 1
# Set these to output them to corresponding files.
output_line = ''
through_line = ''
# Unmatched <details> and <summary> tags confuse the LLM.
summary = re.match(r'<summary>(.*)</summary>', line.strip())
if re.match(r'</?details>', line.strip()) or summary:
line = '\n'
if summary and summary.group and summary.group(1) != 'details' and not re.match(r'.*Contents', summary.group(1)):
# Include summary text in output.
line += f'{summary.group(1)}:\n'
prev_section = section
# Extract tag.
is_section_line = re.match(r'<!-- /(\w+)( -->)?', line.strip())
new_section = None
if is_section_line:
# Characterize the section.
new_section = is_section_line.group(1)
section = new_section
inside_comment = is_section_line.group(2) is None
comment_line = inside_comment # Same as inside_comment, except for <!-- /split -->.
if new_section == 'split' and not comment_line:
comment_line = 'split'
check(new_section in ['split', 'step', 'hide', 'show', 'steve', 'claude', 'help'], f'Unknown section {new_section}')
if inside_comment:
check(new_section in ['split', 'claude', 'steve', 'help'], f'Section type {new_section} is not one that can provide commented text.')
else:
check(new_section in ['split', 'step', 'hide', 'show'], f'Section type {new_section} is one that can be used as a tag.')
end_snippet = begin_snippet = False # To make these visible to debug output.
# Based on the new_section.
if new_section in ['step', 'split']:
# Next step
step_num += 1
if new_section == 'steve':
# /steve section must be in or before course. Close if in.
check(in_course or (in_course is False), "/steve outside of course")
if in_course:
output_line += close_course
through_line += close_course
in_course = ''
elif new_section in ['split', 'claude', 'help']:
hide = False
# The first of any of these will terminate the current course section.
# Only <split> will start a new snippet.
end_snippet = in_course or in_course == ''
begin_snippet = new_section == 'split'
assert new_section != 'help' or prev_section == 'claude' # /help must follow /claude.
if end_snippet:
check(output_file, "Ending snippet with no open output file")
if in_course:
output_line += close_course
through_line += close_course
in_course = None
output_file.write(output_line)
output_file.close()
output_line = ''
output_file = None
if has_history and new_section == 'split' and (prev_section != 'claude'):
# <split> without <claude> means we need to stop saving history ("through" content).
assert through_file
through_file.write(through_line)
close_conversation(through_file)
through_line = ''
#print(f'Closed through_{last_through_step_num}.txt')
through_file = None
has_history = False
if has_history and end_snippet:
# Start a new through file.
through_file.write(through_line)
through_file.flush()
through_line = ''
#print(f'cp LLM_conversation/through_{last_through_step_num}.txt LLM_conversation/through_{step_num}.txt')
os.system(f'cp LLM_conversation/through_{last_through_step_num}.txt LLM_conversation/through_{step_num}.txt')
close_conversation(through_file)
through_file = open_through_file(step_num, 'a')
if begin_snippet:
in_course = False
output_file = open_output_file(step_num)
if has_history:
# Write Human/Assistant label and implicit /split message to through file.
if new_section == 'claude':
# Write Assistant: label.
through_line += f'\n\nAssistant: '
if comment_line:
# Write Human: label.
if new_section in ['steve', 'help', 'split']:
assert through_file
through_line += f'\n\nHuman: '
if comment_line == 'split':
# Write implicit split message.
is_section_line = False
line = "Well done. Everything looks as expected. Here's the next snippet of the course:\n"
elif new_section == 'hide':
# Stop outputting
hide = True
elif new_section == 'show':
# Start outputting again
hide = False
else:
if new_section != 'step':
# Unknown section.
print(f'Line {line_num}: Unknown section {new_section}')
if not is_section_line:
# Strip -->.
if re.match(r'^-->$', line):
# End of comment.
inside_comment = False
comment_line = False
line = ''
# If line is non-blank, it is part of the course. Also strip blank lines at start of course section.
elif not is_section_line and not in_course and not comment_line and not hide:
if line.strip() == '':
# Strip blank line.
line = ''
else:
# Non-blank line begins course snippet.
line = '\n<course>\n' + line
in_course = True
# Echo the line to file(s).
if not is_section_line and not hide:
# step_#.md
if not comment_line or section in ['steve', 'split']:
output_line += line
# through_#.txt
through_line += line
# End implicit /split comment.
if comment_line == 'split':
comment_line = False
if not hide and output_file and output_line:
output_file.write(output_line)
if not hide and through_file and through_line:
through_file.write(through_line)
# Write debug output.
d_through = "" if not through_file else ("<through>" if through_line == debug_line else "<through-mod>")
d_output = "" if not output_file else ("<step>" if output_line == debug_line else "<step-mod>")
d_new_section = "" if not new_section else f'<-- {new_section}{"" if not comment_line else f" -->"}{"BEGIN" if begin_snippet else ""}{"END" if end_snippet else ""}'
d_course = "<course>" if in_course else ("<course-steve>" if in_course == "" else ("<pre-course>" if in_course is False else "<done-course>"))
d_comment = "<comment>" if comment_line else ""
debug_file.write(f'{line_num}: {d_through}{d_output}{d_course}<section:{section}>{d_comment}{debug_line}')
output_file.close()
if through_file:
close_conversation(through_file)