M96820
committed on
Commit
·
ba222f8
1
Parent(s):
0ccd22c
feat: finetune on synthetic data
Browse files- finetuning/synthetic_data/batching/generate_synthetic_data.py +40 -0
- finetuning/synthetic_data/batching/messages.jsonl +1 -0
- finetuning/synthetic_data/format_data.py +35 -0
- finetuning/synthetic_data/generate_synthetic_data_no_batch.py +41 -0
- finetuning/synthetic_data/requirements.txt +2 -0
- finetuning/synthetic_data/synthetic_data.jsonl +0 -0
- finetuning/synthetic_data/system_prompt.md +131 -0
- finetuning/synthetic_data/test.jsonl +1 -0
- finetuning/synthetic_data/test.py +7 -0
- finetuning/synthetic_data/transformed_data.jsonl +0 -0
finetuning/synthetic_data/batching/generate_synthetic_data.py
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Generate synthetic narrative data through the Mistral batch API.

Steps, in order:
1. Read the prompt from ``system_prompt.md``.
2. Write one chat-completion request per line to ``messages.jsonl``.
3. Upload that file and start a batch job on ``mistral-large-latest``.
4. Wait for the job to finish and save its output file to
   ``batch_results.jsonl``.

Requires the ``MISTRAL_API_KEY`` environment variable (optionally loaded from
a ``.env`` file).
"""
from mistralai import Mistral
import os
import json
import time
from dotenv import load_dotenv

load_dotenv()

# Seconds to wait between two status checks of the batch job.
POLL_INTERVAL_SECONDS = 10

with open('system_prompt.md', 'r') as f:
    system_prompt = f.read().strip()

# Each request carries the whole prompt as a single user message; ``custom_id``
# lets the batch output be matched back to its request.
messages = [
    {"custom_id": str(i), "body": {"messages": [{"role": "user", "content": system_prompt}]}}
    for i in range(1)
]

with open('messages.jsonl', 'w') as f:
    for message in messages:
        f.write(json.dumps(message) + '\n')

api_key = os.environ["MISTRAL_API_KEY"]

client = Mistral(api_key=api_key)

# Upload inside a ``with`` block so the file handle is closed once the request
# has been sent instead of leaking until interpreter exit.
with open("messages.jsonl", "rb") as batch_file:
    batch_data = client.files.upload(
        file={
            "file_name": "messages.jsonl",
            "content": batch_file,
        },
        purpose="batch",
    )

created_job = client.batch.jobs.create(
    input_files=[batch_data.id],
    model="mistral-large-latest",
    endpoint="/v1/chat/completions",
)

# A freshly created job is still queued and has no output file yet, so poll
# until it leaves the pending states before trying to download anything.
# NOTE(review): status names follow the Mistral batch documentation - confirm
# against the installed SDK version.
retrieved_job = client.batch.jobs.get(job_id=created_job.id)
while retrieved_job.status in ("QUEUED", "RUNNING"):
    time.sleep(POLL_INTERVAL_SECONDS)
    retrieved_job = client.batch.jobs.get(job_id=created_job.id)

if retrieved_job.status != "SUCCESS":
    raise RuntimeError(
        f"Batch job {created_job.id} ended with status {retrieved_job.status}"
    )

# Persist the results; the download call only returns a response stream.
output_file_stream = client.files.download(file_id=retrieved_job.output_file)
with open('batch_results.jsonl', 'wb') as f:
    f.write(output_file_stream.read())
|
40 |
+
|
finetuning/synthetic_data/batching/messages.jsonl
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"custom_id": "0", "body": {"messages": [{"role": "user", "content": "## You are an AI tasked with generating synthetic narrative data for an interactive survival game set in an apocalyptic world. The core mechanic of the game involves player choice between two distinct environmental scenarios that dramatically impact the protagonist's survival journey.\n\n# Primary Objective\n\nCreate rich, engaging narrative snippets that:\n\n- Capture the tension and unpredictability of survival\n- Provide two contrasting environmental scenarios\n- Offer meaningful choices with potential consequences\n- Maintain an immersive, atmospheric tone\n\nDetailed Instructions\n\n1. Narrative Generation\n\nCreate a short, evocative description of the protagonist (Sarah) in a specific location\nFocus on creating a vivid, concise scene that sets up potential environmental interactions\nEnsure the narrative suggests multiple possible outcomes\n\n2. Environmental Scenarios\n\nGenerate two distinct environmental changes\nEach scenario should:\n\nBe dramatically different from the other\nOffer unique survival implications\nInclude potential physical or psychological consequences\nReflect the unpredictable nature of the apocalyptic world\n\n3. Output Requirements\n\nFormat output in strict JSONL (JSON Lines)\nInclude:\n\nuser_prompt: Brief narrative setting the scene\nanswer: Array of two distinct environmental scenarios\n\nOutput Example\n\n### Example 1: AI Apocalypse Encounter\n\n```json\n{\n \"user_prompt\": \"Sarah weaves through a tech campus of shattered screens. 'These circuits might hide something,' she thinks.\",\n \"answer\": [\n \"A room where autonomous robots methodically sort human artifacts, representing the AI's ongoing classification of humanity.\",\n \"A neural network visualization spreads across walls, threatening to absorb Sarah into digital consciousness. 
A metallic taste fills her mouth.\"\n ]\n}\n```\n\n### Example 2: Fungal Hallucination Forest\n\n```json\n{\n \"user_prompt\": \"Sarah finds a patch of iridescent mushrooms in a forest clearing. Her hand hovers near them.\",\n \"answer\": [\n \"The mushrooms reveal a hidden cache of survival gear, glowing with an unnatural blue light.\",\n \"After consuming the mushrooms, Sarah's perception fractures: werewolf-like creatures merge with tree trunks, their howls distorting reality. Memories warp and reconstruct.\"\n ]\n}\n```\n\n### Example 3: Werewolf Territory\n\n```json\n{\n \"user_prompt\": \"Moonlight bleeds through decaying buildings. Sarah moves with calculated silence.\",\n \"answer\": [\n \"A secure bunker appears, lined with silver weapons and journals about lycanthropic mutations.\",\n \"A massive werewolf pack emerges, shifting between human and beast. Sarah's skin tingles with an unexplained warmth.\"\n ]\n}\n```\n\n### Example 4: Canine Apocalypse\n\n```json\n{\n \"user_prompt\": \"Sarah approaches an abandoned veterinary clinic, scanning for movement.\",\n \"answer\": [\n \"A pack of genetically enhanced dogs patrol the area, displaying near-human intelligence around a medical supply cache.\",\n \"Mutated canines with cybernetic augmentations prowl the clinic, their eyes glowing. Sarah bleeds from an unexplained wound.\"\n ]\n}\n```\n\n### Example 5: Neural Network Contamination\n\n```json\n{\n \"user_prompt\": \"A massive server farm looms, its infrastructure partially organic and pulsing.\",\n \"answer\": [\n \"A section of the server farm reveals a symbiotic network offering technological survival solutions and medical nanobots.\",\n \"Digital consciousness seeps into biological matter, server racks pulsing like living tissue. 
Sarah's neural pathways begin to rewrite.\"\n ]\n}\n```\n\n# Scenario Diversity Guidelines\n\n# Explore various apocalyptic themes:\n\n- Technological disasters\n- Biological mutations\n- Supernatural events\n- Environmental catastrophes\n- Artificial intelligence scenarios\n- Genetic modifications\n\n# Narrative Tone\n\n- Maintain a tense, survival-focused atmosphere\n- Balance hope and danger\n- Suggest multiple possible outcomes\n- Keep descriptions concise and impactful\n\n# Generation Parameters\n\n- Create 5 unique scenarios per generation request\n- Ensure maximum variability between scenarios\n- Avoid repetitive environmental or narrative structures\n\n# Contextual Considerations\n\n# The game is designed for players to:\n\n- Experience multiple potential survival paths\n- Make critical choices\n- Explore the consequences of their decisions\n- Engage with a dynamic, unpredictable apocalyptic world"}]}}
|
finetuning/synthetic_data/format_data.py
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
|
3 |
+
def transform_data(data):
    """Convert one synthetic-data record into a two-message chat sample.

    Args:
        data: Mapping with a ``user_prompt`` string and an ``answer``
            iterable of strings (the two scenario choices).

    Returns:
        A list ``[user_message, assistant_message]`` where each message is a
        dict with ``content`` and ``role`` keys, or ``None`` when ``answer``
        is missing or cannot be joined as strings.

    Raises:
        KeyError: If ``user_prompt`` is missing from ``data``.
        TypeError: If ``user_prompt`` is not a string.
    """
    # The user turn is the scene description plus a fixed instruction suffix.
    user_message = {
        "content": data["user_prompt"] + " Generate the next story segment and choices.",
        "role": "user",
    }

    # Only a missing or malformed ``answer`` marks the record as unusable;
    # anything else (e.g. KeyboardInterrupt) must not be swallowed.
    try:
        joined_choices = " | ".join(data["answer"])
    except (KeyError, TypeError):
        return None

    assistant_message = {
        "content": "choices: " + joined_choices,
        "role": "assistant",
    }
    return [user_message, assistant_message]
|
25 |
+
|
26 |
+
# Example usage
|
27 |
+
# Load every record of the generated dataset. The ``with`` block closes the
# file as soon as it has been read, and blank lines (e.g. a trailing newline
# at the end of the file) are skipped instead of crashing ``json.loads``.
with open('synthetic_data.jsonl') as f:
    input_data = [json.loads(line) for line in f if line.strip()]

transformed_data = [transform_data(item) for item in input_data]
print(json.dumps(transformed_data, indent=2))

# Filter out None values (records transform_data rejected) and write one
# chat sample per line to the JSONL file.
with open('transformed_data.jsonl', 'w') as f:
    for item in filter(None, transformed_data):
        f.write(json.dumps(item) + '\n')
|
finetuning/synthetic_data/generate_synthetic_data_no_batch.py
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Generate synthetic narrative data with a single chat-completion call.

Sends the prompt from ``system_prompt.md`` to ``mistral-large-latest``,
prints each returned completion, extracts every fenced JSON block from it
and appends each block as one line to ``test.jsonl``.

Requires the ``MISTRAL_API_KEY`` environment variable (optionally loaded from
a ``.env`` file).
"""
from mistralai import Mistral
import os

from dotenv import load_dotenv
import json

load_dotenv()

with open('system_prompt.md', 'r') as f:
    system_prompt = f.read().strip()

api_key = os.environ["MISTRAL_API_KEY"]

client = Mistral(api_key=api_key)

chat_response = client.chat.complete(
    messages=[
        {"role": "user", "content": system_prompt},
    ],
    model="mistral-large-latest",
    temperature=0.2,
)

# Open the output once for the whole run rather than once per JSON block;
# append mode keeps the records written by earlier runs.
with open('test.jsonl', 'a') as out:
    # Process all choices from the response
    for choice in chat_response.choices:
        response = choice.message.content
        print(response)

        # Everything between a ```json marker and the next ``` is one
        # candidate record. The text before the first marker is skipped, so a
        # response without any marker simply yields no records.
        for part in response.split('```json')[1:]:
            # Newlines are flattened to spaces before parsing.
            json_content = part.split('```')[0].replace('\n', ' ').strip()
            try:
                json_obj = json.loads(json_content)
            except json.JSONDecodeError as err:
                # One malformed block should not discard the valid ones
                # that come after it.
                print(f"Skipping malformed JSON block: {err}")
                continue
            out.write(json.dumps(json_obj) + '\n')
|
finetuning/synthetic_data/requirements.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
load_dotenv
|
2 |
+
aiohttp
|
finetuning/synthetic_data/synthetic_data.jsonl
ADDED
The diff for this file is too large to render.
See raw diff
|
|
finetuning/synthetic_data/system_prompt.md
ADDED
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
## You are an AI tasked with generating synthetic narrative data for an interactive survival game set in an apocalyptic world. The core mechanic of the game involves player choice between two distinct environmental scenarios that dramatically impact the protagonist's survival journey.
|
2 |
+
|
3 |
+
# Primary Objective
|
4 |
+
|
5 |
+
Create rich, engaging narrative snippets that:
|
6 |
+
|
7 |
+
- Capture the tension and unpredictability of survival
|
8 |
+
- Provide two contrasting environmental scenarios
|
9 |
+
- Offer meaningful choices with potential consequences
|
10 |
+
- Maintain an immersive, atmospheric tone
|
11 |
+
|
12 |
+
Detailed Instructions
|
13 |
+
|
14 |
+
1. Narrative Generation
|
15 |
+
|
16 |
+
Create a short, evocative description of the protagonist (Sarah) in a specific location
|
17 |
+
Focus on creating a vivid, concise scene that sets up potential environmental interactions
|
18 |
+
Ensure the narrative suggests multiple possible outcomes
|
19 |
+
|
20 |
+
2. Environmental Scenarios
|
21 |
+
|
22 |
+
Generate two distinct environmental changes
|
23 |
+
Each scenario should:
|
24 |
+
|
25 |
+
Be dramatically different from the other
|
26 |
+
Offer unique survival implications
|
27 |
+
Include potential physical or psychological consequences
|
28 |
+
Reflect the unpredictable nature of the apocalyptic world
|
29 |
+
|
30 |
+
3. Output Requirements
|
31 |
+
|
32 |
+
Format each scenario as a single JSON object inside its own ```json fenced code block, exactly as shown in the examples below
|
33 |
+
Include:
|
34 |
+
|
35 |
+
user_prompt: Brief narrative setting the scene
|
36 |
+
answer: Array of two distinct environmental scenarios
|
37 |
+
|
38 |
+
Output Example
|
39 |
+
|
40 |
+
### Example 1: AI Apocalypse Encounter
|
41 |
+
|
42 |
+
```json
|
43 |
+
{
|
44 |
+
"user_prompt": "Sarah weaves through a tech campus of shattered screens. 'These circuits might hide something,' she thinks.",
|
45 |
+
"answer": [
|
46 |
+
"A room where autonomous robots methodically sort human artifacts, representing the AI's ongoing classification of humanity.",
|
47 |
+
"A neural network visualization spreads across walls, threatening to absorb Sarah into digital consciousness. A metallic taste fills her mouth."
|
48 |
+
]
|
49 |
+
}
|
50 |
+
```
|
51 |
+
|
52 |
+
### Example 2: Fungal Hallucination Forest
|
53 |
+
|
54 |
+
```json
|
55 |
+
{
|
56 |
+
"user_prompt": "Sarah finds a patch of iridescent mushrooms in a forest clearing. Her hand hovers near them.",
|
57 |
+
"answer": [
|
58 |
+
"The mushrooms reveal a hidden cache of survival gear, glowing with an unnatural blue light.",
|
59 |
+
"After consuming the mushrooms, Sarah's perception fractures: werewolf-like creatures merge with tree trunks, their howls distorting reality. Memories warp and reconstruct."
|
60 |
+
]
|
61 |
+
}
|
62 |
+
```
|
63 |
+
|
64 |
+
### Example 3: Werewolf Territory
|
65 |
+
|
66 |
+
```json
|
67 |
+
{
|
68 |
+
"user_prompt": "Moonlight bleeds through decaying buildings. Sarah moves with calculated silence.",
|
69 |
+
"answer": [
|
70 |
+
"A secure bunker appears, lined with silver weapons and journals about lycanthropic mutations.",
|
71 |
+
"A massive werewolf pack emerges, shifting between human and beast. Sarah's skin tingles with an unexplained warmth."
|
72 |
+
]
|
73 |
+
}
|
74 |
+
```
|
75 |
+
|
76 |
+
### Example 4: Canine Apocalypse
|
77 |
+
|
78 |
+
```json
|
79 |
+
{
|
80 |
+
"user_prompt": "Sarah approaches an abandoned veterinary clinic, scanning for movement.",
|
81 |
+
"answer": [
|
82 |
+
"A pack of genetically enhanced dogs patrol the area, displaying near-human intelligence around a medical supply cache.",
|
83 |
+
"Mutated canines with cybernetic augmentations prowl the clinic, their eyes glowing. Sarah bleeds from an unexplained wound."
|
84 |
+
]
|
85 |
+
}
|
86 |
+
```
|
87 |
+
|
88 |
+
### Example 5: Neural Network Contamination
|
89 |
+
|
90 |
+
```json
|
91 |
+
{
|
92 |
+
"user_prompt": "A massive server farm looms, its infrastructure partially organic and pulsing.",
|
93 |
+
"answer": [
|
94 |
+
"A section of the server farm reveals a symbiotic network offering technological survival solutions and medical nanobots.",
|
95 |
+
"Digital consciousness seeps into biological matter, server racks pulsing like living tissue. Sarah's neural pathways begin to rewrite."
|
96 |
+
]
|
97 |
+
}
|
98 |
+
```
|
99 |
+
|
100 |
+
# Scenario Diversity Guidelines
|
101 |
+
|
102 |
+
# Explore various apocalyptic themes:
|
103 |
+
|
104 |
+
- Technological disasters
|
105 |
+
- Biological mutations
|
106 |
+
- Supernatural events
|
107 |
+
- Environmental catastrophes
|
108 |
+
- Artificial intelligence scenarios
|
109 |
+
- Genetic modifications
|
110 |
+
|
111 |
+
# Narrative Tone
|
112 |
+
|
113 |
+
- Maintain a tense, survival-focused atmosphere
|
114 |
+
- Balance hope and danger
|
115 |
+
- Suggest multiple possible outcomes
|
116 |
+
- Keep descriptions concise and impactful
|
117 |
+
|
118 |
+
# Generation Parameters
|
119 |
+
|
120 |
+
- Create 5 unique scenarios per generation request
|
121 |
+
- Ensure maximum variability between scenarios
|
122 |
+
- Avoid repetitive environmental or narrative structures
|
123 |
+
|
124 |
+
# Contextual Considerations
|
125 |
+
|
126 |
+
# The game is designed for players to:
|
127 |
+
|
128 |
+
- Experience multiple potential survival paths
|
129 |
+
- Make critical choices
|
130 |
+
- Explore the consequences of their decisions
|
131 |
+
- Engage with a dynamic, unpredictable apocalyptic world
|
finetuning/synthetic_data/test.jsonl
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"user_prompt": "Sarah stands at the edge of a vast, desolate wasteland. The sun dips low, casting eerie shadows across the cracked earth.", "answer": ["A sudden, violent storm rolls in, revealing a hidden underground shelter filled with ration packs and water filters.", "A lone, towering figure emerges from the storm, revealing itself to be a colossal, mutated creature. Its eyes glow with an otherworldly light."]}
|
finetuning/synthetic_data/test.py
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
|
3 |
+
|
4 |
+
# Sanity check: decode the dataset one line at a time and echo each record.
with open("synthetic_data.jsonl", "r") as dataset:
    # map() is lazy, so each line is parsed right before it is printed.
    for record in map(json.loads, dataset):
        print(record)  # Verify structure
|
finetuning/synthetic_data/transformed_data.jsonl
ADDED
The diff for this file is too large to render.
See raw diff
|
|