# Copyright (c) 2025, Anemll All rights reserved.
#
# Use of this source code is governed by a MIT license that can be
# found in the LICENSE.txt file or at https://opensource.org/license/mit
import coremltools as ct
import numpy as np
import torch
from transformers import AutoTokenizer
import os
import time, sys
import signal
import traceback
import torch.nn.functional as F
import queue
import threading
import re
# Configuration
CONTEXT_LENGTH = 1024 # Changed default from 512 to 1024
PREFILL_BATCH_SIZE = 64
MODEL_PATH = os.path.expanduser("../DeepSeekR1-8B")
ENABLE_VOCAB_SPLIT8 = True # Enable 8-way vocab split
ENABLE_LOGITS2 = False # Enable 2-way vocab split
ENABLE_DEBUG = False
ENABLE_ARGMAX = False
ENABLE_PREFILL_BATCH = True
ENABLE_CHAT_DEBUG = False # Debug flag for chat loop
# ANSI color codes
LIGHT_BLUE = "\033[94m"
DARK_BLUE = "\033[34m"
LIGHT_GREEN = "\033[92m"
RESET_COLOR = "\033[0m"
if ENABLE_LOGITS2:
assert not ENABLE_ARGMAX, "ENABLE_ARGMAX must be False when ENABLE_LOGITS2 is True"
def load_model(path, compute_unit=ct.ComputeUnit.CPU_AND_NE, function_name=None):
"""Load either compiled or uncompiled CoreML model.
Args:
path: Path to the model file (.mlmodelc or .mlpackage)
compute_unit: CoreML compute unit to use
function_name: Optional function name to select from multi-function models
"""
DebugLog(f"Attempting to load model: {path}")
DebugLog(f"File exists: {os.path.exists(path)}")
DebugLog(f"Is directory (for mlmodelc): {os.path.isdir(path)}")
try:
if path.endswith('.mlmodelc'):
DebugLog(f"Loading compiled model: {path}")
if function_name is None:
DebugLog("Loading without function name")
model = ct.models.CompiledMLModel(path, compute_unit)
else:
DebugLog(f"Loading with function name: {function_name}")
model = ct.models.CompiledMLModel(path, compute_unit, function_name=function_name)
else:
DebugLog(f"Loading uncompiled model: {path}")
if function_name is None:
DebugLog("Loading without function name")
model = ct.models.MLModel(model=path, compute_units=compute_unit, is_temp_package=False)
else:
DebugLog(f"Loading with function name: {function_name}")
model = ct.models.MLModel(model=path, compute_units=compute_unit, is_temp_package=False, function_name=function_name)
DebugLog("Model loaded successfully")
return model
except Exception as e:
DebugLog(f"Error loading model: {str(e)}")
DebugLog(f"Error type: {type(e)}")
raise
class SplitModelInference:
def __init__(self, model_parts, model_dir="."):
"""Initialize split model inference.
Args:
model_parts (list): List of model part numbers to load
Special cases:
- 'C123' for combined part2 with prefill/infer functions
- 'S123' for split model with prefill/infer functions
- 'Q123' for quad split (2Q1-2Q4)
- 'Q123S' for quad split with combined prefill/infer (2Q1S-2Q4S)
- '123D' for dual split without prefill/infer (2D1-2D2)
model_dir (str): Directory containing the model files (default: current directory)
"""
self.context_size = CONTEXT_LENGTH
self.model_dir = model_dir
DebugLog(f"Loading models from directory: {self.model_dir}")
# Parse configuration
self.quant_configs = {}
global_lut = None
if model_parts and model_parts[-1].startswith('lut'):
global_lut = model_parts[-1]
model_parts = model_parts[:-1]
# Special handling for different split modes
if len(model_parts) == 1:
if model_parts[0] == '123D': # Dual split without prefill/infer
self.use_combined_part2 = False
self.use_split_model = True
self.use_split_functions = False
self.use_quad_split = False
self.use_quad_split_combined = False
self.model_parts = ['1', '2D1', '2D2', '3']
if global_lut:
self.quant_configs = {part: global_lut for part in self.model_parts}
DebugLog(f"Using dual split model with parts: {self.model_parts}")
elif model_parts[0].startswith('C123'): # Combined part2
self.use_combined_part2 = True
self.use_split_model = False
self.use_split_functions = False
self.use_quad_split = False
self.use_quad_split_combined = False
self.model_parts = ['1', '2', '3']
if global_lut:
self.quant_configs = {part: global_lut for part in self.model_parts}
DebugLog(f"Using combined part2 model with parts: {self.model_parts}")
elif model_parts[0].startswith('S123'): # Split model with prefill/infer functions
self.use_combined_part2 = False
self.use_split_model = True
self.use_split_functions = True
self.use_quad_split = False
self.use_quad_split_combined = False
self.model_parts = ['1', '2D1S', '2D2S', '3']
elif model_parts[0].startswith('Q123S'): # Quad split with combined prefill/infer
self.use_combined_part2 = False
self.use_split_model = True
self.use_split_functions = False
self.use_quad_split = False
self.use_quad_split_combined = True
self.model_parts = ['1', '2Q1S', '2Q2S', '2Q3S', '2Q4S', '3']
elif model_parts[0].startswith('Q123'): # Regular quad split
self.use_combined_part2 = False
self.use_split_model = True
self.use_split_functions = False
self.use_quad_split = True
self.use_quad_split_combined = False
self.model_parts = ['1', '2Q1', '2Q2', '2Q3', '2Q4', '3']
else:
self.use_combined_part2 = False
self.use_split_model = False
self.use_split_functions = False
self.use_quad_split = False
self.use_quad_split_combined = False
self.model_parts = model_parts
else:
self.use_combined_part2 = False
self.use_split_model = False
self.use_split_functions = False
self.use_quad_split = False
self.use_quad_split_combined = False
self.model_parts = model_parts
# Apply global quantization if specified
if global_lut and not self.use_combined_part2: # Skip if already applied for C123
self.quant_configs = {part: global_lut for part in self.model_parts}
DebugLog(f"Using model parts: {self.model_parts}")
if global_lut:
DebugLog(f"With global quantization: {global_lut}")
if self.use_combined_part2:
DebugLog("Using combined part2 model with prefill/infer functions")
elif self.use_split_functions:
DebugLog("Using split model with prefill/infer functions")
elif self.use_quad_split:
DebugLog("Using quad split transformer model (2Q1-2Q4)")
elif self.use_quad_split_combined:
DebugLog("Using combined quad split transformer model (2Q1S-2Q4S)")
self.models = {}
self.states = {}
self.load_models()
def find_model_path(self, base_name, description="model"):
"""Find model path, checking mlmodelc first then mlpackage.
Also tries both with and without lut suffix.
Args:
base_name: Base name of the model without extension
description: Description for error message (e.g., "Split model part 2D1S")
Returns:
str: Path to the found model file
Raises:
FileNotFoundError: If neither mlmodelc nor mlpackage exists
"""
# For quad split parts, only try mlmodelc
if any(part in base_name for part in ['2Q1S', '2Q2S', '2Q3S', '2Q4S', '2Q1', '2Q2', '2Q3', '2Q4']):
model_path = os.path.join(self.model_dir, f"{base_name}.mlmodelc")
if os.path.exists(model_path):
return model_path
# If not found, try without lut suffix
if '_lut' in base_name:
base_without_lut = base_name.split('_lut')[0]
model_path = os.path.join(self.model_dir, f"{base_without_lut}.mlmodelc")
if os.path.exists(model_path):
return model_path
# Neither exists
raise FileNotFoundError(f"{description} not found: {base_name}.mlmodelc does not exist" +
(f" (also tried {base_name.split('_lut')[0]}.mlmodelc)" if '_lut' in base_name else ""))
# For other parts, try both mlmodelc and mlpackage
for ext in ['.mlmodelc', '.mlpackage']:
model_path = os.path.join(self.model_dir, f"{base_name}{ext}")
if os.path.exists(model_path):
return model_path
# If not found, try without lut suffix
if '_lut' in base_name:
base_without_lut = base_name.split('_lut')[0]
for ext in ['.mlmodelc', '.mlpackage']:
model_path = os.path.join(self.model_dir, f"{base_without_lut}{ext}")
if os.path.exists(model_path):
return model_path
# Neither exists
raise FileNotFoundError(f"{description} not found: neither {base_name}.mlmodelc nor {base_name}.mlpackage exist in {self.model_dir}" +
(f" (also tried {base_name.split('_lut')[0]}.mlmodelc/mlpackage)" if '_lut' in base_name else ""))
def load_models(self):
"""Load each model part."""
DebugLog("Loading model parts...")
for part in self.model_parts:
quant_suffix = f"_{self.quant_configs[part]}" if part in self.quant_configs else ""
model_key = f"{part}{quant_suffix}" # Use this as the key in self.models
try:
if part == '2' and self.use_combined_part2:
# Load combined part2 with multiple functions
base_name = f"llama32_part2_combined{quant_suffix}"
model_path = self.find_model_path(base_name, "Combined part2 model")
DebugLog(f"Loading combined part2 model: {model_path}")
# Load prefill function
self.models['2_prefill'] = load_model(model_path, compute_unit=ct.ComputeUnit.CPU_AND_NE, function_name='prefill')
# Load infer function
self.models['2_infer'] = load_model(model_path, compute_unit=ct.ComputeUnit.CPU_AND_NE, function_name='infer')
# Create shared state
self.states['transformer'] = self.models['2_prefill'].make_state()
DebugLog("Combined part2 model loaded successfully")
elif part == '2' and not self.use_combined_part2:
# Load regular part2 model
base_name = f"llama32_part2{quant_suffix}"
model_path = self.find_model_path(base_name, "Regular part2 model")
DebugLog(f"Loading regular part2 model: {model_path}")
self.models[model_key] = load_model(model_path)
self.states['transformer'] = self.models[model_key].make_state()
DebugLog("Regular part2 model loaded successfully")
elif part in ['2D1S', '2D2S'] and self.use_split_functions:
# Load split model with prefill/infer functions
base_name = f"llama32_part{part}{quant_suffix}"
model_path = self.find_model_path(base_name, f"Split model part {part}")
DebugLog(f"Loading split model part {part}: {model_path}")
# Load prefill function
self.models[f'{part}_prefill'] = load_model(model_path, compute_unit=ct.ComputeUnit.CPU_AND_NE, function_name='prefill')
# Load infer function
self.models[f'{part}_infer'] = load_model(model_path, compute_unit=ct.ComputeUnit.CPU_AND_NE, function_name='infer')
# Create shared state for first part only
if part == '2D1S':
self.states['transformer'] = self.models[f'{part}_infer'].make_state()
DebugLog(f"Split model part {part} loaded successfully")
elif part.endswith('S') and self.use_quad_split_combined:
# Load combined quad split model with prefill/infer functions
base_name = f"llama32_part{part}{quant_suffix}"
model_path = self.find_model_path(base_name, f"Combined quad split part {part}")
DebugLog(f"Loading combined quad split part {part}: {model_path}")
# Load prefill function
self.models[f'{part}_prefill'] = load_model(model_path, compute_unit=ct.ComputeUnit.CPU_AND_NE, function_name='prefill')
# Load infer function
self.models[f'{part}_infer'] = load_model(model_path, compute_unit=ct.ComputeUnit.CPU_AND_NE, function_name='infer')
# Create shared state for first part only
if part == '2Q1S':
self.states['transformer'] = self.models[f'{part}_infer'].make_state()
DebugLog(f"Created shared transformer state for all quad split parts")
DebugLog(f"Combined quad split part {part} loaded successfully")
elif part.startswith('2Q') and self.use_quad_split:
# Load quad split model with prefill/infer functions
# Append 'S' to part name for file lookup
base_name = f"llama32_part{part}S{quant_suffix}"
model_path = self.find_model_path(base_name, f"Quad split part {part}")
DebugLog(f"Loading quad split part {part}: {model_path}")
# Load prefill function
self.models[f'{part}_prefill'] = load_model(model_path, compute_unit=ct.ComputeUnit.CPU_AND_NE, function_name='prefill')
# Load infer function
self.models[f'{part}_infer'] = load_model(model_path, compute_unit=ct.ComputeUnit.CPU_AND_NE, function_name='infer')
# Create shared state for first part only
if part == '2Q1':
self.states['transformer'] = self.models[f'{part}_infer'].make_state()
DebugLog(f"Created shared transformer state for all quad split parts")
print(f"Created shared transformer state for all quad split parts")
print(f"Quad split part {part} loaded successfully")
else:
# Load regular models (part 1 and part3)
base_name = f"llama32_part{part}{quant_suffix}"
model_path = self.find_model_path(base_name, f"Regular part {part}")
print(f"[MODEL LOAD] Regular part {part}:")
print(f" - File: {model_path}")
print(f" - Loading as: '{model_key}'")
# Try loading with CPU_AND_NE first, then fall back to CPU-only if needed
try:
self.models[model_key] = load_model(model_path, compute_unit=ct.ComputeUnit.CPU_AND_NE)
print(f" - Loaded with CPU_AND_NE compute unit")
except Exception as ane_error:
print(f" - CPU_AND_NE load failed, falling back to CPU: {str(ane_error)}")
self.models[model_key] = load_model(model_path, compute_unit=ct.ComputeUnit.CPU)
print(f" - Loaded with CPU compute unit")
print(f"[MODEL LOAD] Current model_parts keys: {list(self.models.keys())}")
except Exception as e:
print(f"Error loading model part {part}: {str(e)}")
raise
def run_transformer_prefill(self, hidden_states, update_mask, position_ids, causal_mask, current_pos):
"""Run the transformer model in prefill mode."""
if self.use_split_functions:
# Use prefill variants for split model
for part in ['2D1S', '2D2S']:
inputs = {
'hidden_states': hidden_states.numpy(),
'position_ids': position_ids.numpy(),
'causal_mask': causal_mask.numpy(),
'start_pos': current_pos.numpy()
}
output = self.models[f'{part}_prefill'].predict(inputs, self.states['transformer'])
hidden_states = torch.from_numpy(output['dummy_output'])
return hidden_states
else:
# Use existing prefill implementation
return super().run_transformer_prefill(hidden_states, update_mask, position_ids, causal_mask, current_pos)
def run_transformer_infer(self, hidden_states, update_mask, position_ids, causal_mask, current_pos):
"""Run the transformer model in infer mode."""
if self.use_split_functions:
# Use infer variants for split model
for part in ['2D1S', '2D2S']:
inputs = {
'hidden_states': hidden_states.numpy(),
'update_mask': update_mask.numpy(),
'position_ids': position_ids.numpy(),
'causal_mask': causal_mask.numpy(),
'current_pos': current_pos.numpy()
}
output = self.models[f'{part}_infer'].predict(inputs, self.states['transformer'])
hidden_states = torch.from_numpy(output['transformer_output'])
return hidden_states
else:
# Use existing infer implementation
return super().run_transformer_infer(hidden_states, update_mask, position_ids, causal_mask, current_pos)
def get_state(self, part):
"""Get the appropriate state for a model part."""
return self.states['transformer']
def run_embeddings(self, input_ids):
"""Run the embeddings model (part 1)."""
if '1' not in self.models:
raise ValueError("Embeddings model (part 1) not loaded")
output_dict = self.models['1'].predict({
'input_ids': input_ids.numpy()
})
return torch.from_numpy(output_dict['hidden_states'])
def run_transformer(self, hidden_states, update_mask, position_ids, causal_mask, current_pos, part='2'):
"""Run the transformer model."""
if part not in self.models:
raise ValueError(f"Transformer model (part {part}) not loaded")
inputs = {
'hidden_states': hidden_states.numpy(),
'update_mask': update_mask.numpy(),
'position_ids': position_ids.numpy(),
'causal_mask': causal_mask.numpy(),
'current_pos': current_pos.numpy()
}
output_dict = self.models[part].predict(inputs, self.get_state(part))
return torch.from_numpy(output_dict['transformer_output'])
def run_transformer_splits(self, hidden_states, update_mask, position_ids, causal_mask, current_pos):
"""Run through transformer splits based on model configuration."""
if not self.use_split_model:
return self.run_transformer(hidden_states, update_mask, position_ids, causal_mask, current_pos)
# Handle different split configurations
if any(part.startswith('2Q') for part in self.model_parts): # Quad split
for i in range(1, 5):
part = f'2Q{i}'
hidden_states = self.run_transformer(
hidden_states, update_mask, position_ids, causal_mask, current_pos, part=part
)
elif any(part.startswith('2O') for part in self.model_parts): # Octa split
for i in range(1, 9):
part = f'2O{i}'
hidden_states = self.run_transformer(
hidden_states, update_mask, position_ids, causal_mask, current_pos, part=part
)
elif any(part.startswith('2D') for part in self.model_parts): # Dual split
# Run through both parts of the dual split
for base_part in ['2D1', '2D2']:
# Find the correct model key (with lut suffix if present)
part_key = next(key for key in self.models.keys() if key.startswith(f'{base_part}_') or key == base_part)
# Use the shared transformer state
if 'transformer' not in self.states:
raise ValueError("Transformer state not initialized. Make sure 2D1 is loaded first.")
inputs = {
'hidden_states': hidden_states.numpy(),
'update_mask': update_mask.numpy(),
'position_ids': position_ids.numpy(),
'causal_mask': causal_mask.numpy(),
'current_pos': current_pos.numpy()
}
output_dict = self.models[part_key].predict(inputs, self.states['transformer'])
hidden_states = torch.from_numpy(output_dict['transformer_output'])
return hidden_states
def run_lm_head(self, hidden_states):
"""Run the LM head model (part 3)."""
if '3' not in self.models:
raise ValueError("LM head model (part 3) not loaded")
output_dict = self.models['3'].predict({
'hidden_states': hidden_states.numpy()
})
# Handle split logits
logits_parts = []
for i in range(1, 9): # logits1 through logits8
logits_key = f'logits{i}'
if logits_key in output_dict:
logits_part = torch.from_numpy(output_dict[logits_key])
logits_parts.append(logits_part)
# Concatenate along the vocabulary dimension
return torch.cat(logits_parts, dim=-1)
def run_full_model(self, input_ids, update_mask, position_ids, causal_mask, current_pos):
"""Run the full model."""
if 'full' not in self.models:
raise ValueError("Full model not loaded")
# Update context size from global
self.context_size = CONTEXT_LENGTH
#kv_ was removed from the input names
inputs = {
'input_ids': input_ids.numpy(),
'update_mask': update_mask.numpy(),
'position_ids': position_ids.numpy(),
'causal_mask': causal_mask.numpy(),
'current_pos': current_pos.numpy()
}
# Print shapes of all inputs
if False:
print("[DEBUG] Input shapes:")
for key, value in inputs.items():
print(f" {key}: {value.shape}")
output_dict = self.models['full'].predict(inputs, self.states['transformer'])
# Handle split logits if necessary
if ENABLE_VOCAB_SPLIT8:
logits_parts = []
for i in range(1, 9):
logits_parts.append(output_dict[f'logits{i}'])
logits = np.concatenate(logits_parts, axis=-1)
else:
logits = output_dict['logits']
return torch.from_numpy(logits)
def make_causal_mask(length, start):
# Initialize the mask with -inf
mask = np.full((1, 1, length, length), -np.inf, dtype=np.float16)
# Create row and column indices
row_indices = np.arange(length).reshape(length, 1) # Column vector
col_indices = np.arange(length).reshape(1, length) # Row vector
# Set allowed positions to 0 where col_index is within the allowed range of row_index
mask[:, :, col_indices <= (row_indices + start)] = 0
return mask
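# Worked example (length=4, start=0): make_causal_mask yields, per (row, col),
#   [[  0, -inf, -inf, -inf],
#    [  0,    0, -inf, -inf],
#    [  0,    0,    0, -inf],
#    [  0,    0,    0,    0]]
# i.e. position i may attend to positions j <= i + start; the returned shape is (1, 1, length, length).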
def initialize_tokenizer(model_path):
"""Initialize and configure the tokenizer."""
try:
print(f"[DEBUG] Loading tokenizer from model path: {model_path}")
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
print("\n[DEBUG] Tokenizer Configuration:")
print(f"Tokenizer type: {type(tokenizer)}")
print(f"Tokenizer name: {tokenizer.__class__.__name__}")
print(f"Vocabulary size: {len(tokenizer)}")
print(f"Model max length: {tokenizer.model_max_length}")
#print(f"Chat template: {tokenizer.chat_template if hasattr(tokenizer, 'chat_template') else 'None'}")
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
print("[DEBUG] Set PAD token to EOS token")
print(f"\n[DEBUG] Special Tokens:")
print(f"PAD token: '{tokenizer.pad_token}' (ID: {tokenizer.pad_token_id})")
print(f"EOS token: '{tokenizer.eos_token}' (ID: {tokenizer.eos_token_id})")
print(f"BOS token: '{tokenizer.bos_token}' (ID: {tokenizer.bos_token_id})")
print(f"UNK token: '{tokenizer.unk_token}' (ID: {tokenizer.unk_token_id})")
return tokenizer
except Exception as e:
print(f"[ERROR] Failed to load tokenizer from {model_path}")
return None
class TokenPrinter:
"""Handles background printing of generated tokens."""
def __init__(self, tokenizer):
self.tokenizer = tokenizer
self.token_queue = queue.Queue()
self.stop_event = threading.Event()
self.thread = None
self.buffer = ""
self.lock = threading.Lock()
self.thinking = True # Track if we're still in thinking mode
self.decoding_buffer = [] # <-- Buffer for token IDs
self.start()
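# Usage sketch: the generation loop calls add_token(token_id) as tokens are sampled,
# periodically calls drain_buffer() on the main thread to decode and print them, and
# calls stop() once at the end to join the worker thread and reset terminal colors.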
def start(self):
"""Start the printer thread."""
if self.thread is None:
self.thread = threading.Thread(target=self._print_worker)
self.thread.daemon = True
self.thread.start()
def add_token(self, token_id):
"""Add a token to the print queue."""
if not self.stop_event.is_set():
self.token_queue.put(token_id)
def drain_buffer(self):
"""
Decode token IDs from self.decoding_buffer in the main thread,
then print them with the correct color logic.
"""
if not self.decoding_buffer:
return
# Decode all tokens at once in the main thread.
token_str = self.tokenizer.decode(self.decoding_buffer)
self.decoding_buffer.clear()
# Color-handling logic. Check for "</think>" and handle self.thinking.
if self.thinking and "</think>" in token_str:
self.thinking = False
parts = token_str.split("</think>")
if len(parts) > 0:
print(parts[0] + "</think>", end='', flush=True)
if len(parts) > 1:
print(LIGHT_BLUE + parts[1], end='', flush=True)
else:
if not self.thinking:
print(LIGHT_BLUE + token_str, end='', flush=True)
else:
print(token_str, end='', flush=True)
def _print_worker(self):
"""Worker thread that takes token_ids from the queue but doesn't decode."""
while not self.stop_event.is_set():
try:
token_id = self.token_queue.get(timeout=0.01)
with self.lock:
# Just store the token_id, decode later on the main thread
self.decoding_buffer.append(token_id)
self.token_queue.task_done()
except queue.Empty:
continue
except Exception as e:
print(f"\n[ERROR] Token printer error: {str(e)}")
break
def stop(self):
"""Stop the printer thread."""
if self.thread and self.thread.is_alive():
self.stop_event.set()
try:
self.thread.join(timeout=1.0)
except Exception:
pass
print(RESET_COLOR) # Reset color at the end
return self.buffer
def parse_coreml_error(error_str):
"""Parse CoreML error message to extract shape information.
Args:
error_str: The error message string from CoreML
Returns:
tuple: (got_shape, expected_shape) or None if parsing fails
"""
try:
# Extract shapes from error message using regex
pattern = r"shape \(([\d\s x]+)\) does not match the shape \(([\d\s x]+)\)"
match = re.search(pattern, str(error_str))
if match:
got_shape = tuple(int(x) for x in match.group(1).split('x'))
expected_shape = tuple(int(x) for x in match.group(2).split('x'))
return got_shape, expected_shape
return None
except Exception as e:
print(f"Error parsing CoreML error message: {e}")
return None
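# Illustrative input (assumed error wording): a CoreML message such as
#   "MultiArray shape (1 x 64 x 1024) does not match the shape (1 x 64 x 512) ..."
# is parsed into ((1, 64, 1024), (1, 64, 512)); anything that does not match the
# pattern returns None.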
def handle_coreml_shape_error(e, model_name=""):
"""Handle CoreML shape mismatch errors with detailed information.
Args:
e: The exception object
model_name: Name of the model for better error reporting
"""
error_str = str(e)
if "MultiArray shape" in error_str:
shape_info = parse_coreml_error(error_str)
if shape_info:
got_shape, expected_shape = shape_info
print(f"\n[ERROR] Shape mismatch in {model_name}:")
print(f" Got shape: {' x '.join(str(x) for x in got_shape)}")
print(f" Expected shape: {' x '.join(str(x) for x in expected_shape)}")
print("This usually indicates a mismatch between the model's expected context length")
print("and the actual input being provided.")
else:
print(f"\n[ERROR] Shape mismatch error in {model_name}:")
print(f" {error_str}")
else:
print(f"\n[ERROR] CoreML error in {model_name}:")
print(f" {error_str}")
def PreFillChunk(model_parts, input_ids, current_pos, context_size, causal_mask, batch_size=64):
tokens_to_process = current_pos
batch_pos = 0
while batch_pos < tokens_to_process:
batch_end = min(batch_pos + batch_size, tokens_to_process)
current_batch_size = batch_end - batch_pos
try:
# Get current batch of tokens
batch_input = input_ids[:, batch_pos:batch_end]
# Pad if needed
if current_batch_size < batch_size:
batch_input = F.pad(
batch_input,
(0, batch_size - current_batch_size),
value=0
)
# Generate position IDs for this batch
position_ids = torch.arange(batch_pos, batch_pos + batch_size, dtype=torch.int32)
# Prepare causal mask for this batch
multiple_causal_mask = causal_mask[:, :, batch_pos:batch_pos + batch_size, :]
# Find the correct model key for part 1 (with lut suffix if present)
part1_key = next(key for key in model_parts.keys() if key.startswith('1_') or key == '1')
try:
# Run embeddings (part 1)
hidden_states = model_parts[part1_key].predict({'input_ids': batch_input.numpy()})['hidden_states']
hidden_states = torch.from_numpy(hidden_states)
except Exception as e:
handle_coreml_shape_error(e, f"embeddings model (part {part1_key})")
raise
# Get shared transformer state
shared_state = model_parts['states']['transformer']
# Handle different model configurations
if any(f'{part}_prefill' in model_parts for part in ['2D1S', '2D2S']):
# S123 mode with prefill/infer functions
for part in ['2D1S', '2D2S']:
try:
inputs = {
'hidden_states': hidden_states.numpy(),
'position_ids': position_ids.numpy(),
'causal_mask': multiple_causal_mask.numpy(),
'start_pos': np.array([batch_pos], dtype=np.int32)
}
output = model_parts[f'{part}_prefill'].predict(inputs, shared_state)
hidden_states = torch.from_numpy(output['dummy_output'])
except Exception as e:
handle_coreml_shape_error(e, f"transformer model (part {part})")
raise
elif any(f'2Q{i}S_prefill' in model_parts for i in range(1, 5)):
# Q123S mode with combined quad split
for i in range(1, 5):
part = f'2Q{i}S'
try:
inputs = {
'hidden_states': hidden_states.numpy(),
'position_ids': position_ids.numpy(),
'causal_mask': multiple_causal_mask.numpy(),
'start_pos': np.array([batch_pos], dtype=np.int32)
}
output = model_parts[f'{part}_prefill'].predict(inputs, shared_state)
hidden_states = torch.from_numpy(output['dummy_output'])
except Exception as e:
handle_coreml_shape_error(e, f"transformer model (part {part})")
raise
elif any(part.startswith('2Q') for part in model_parts):
# Q123 mode with quad split
for i in range(1, 5):
part = f'2Q{i}'
if f'{part}_prefill' in model_parts:
# Use prefill function if available
try:
inputs = {
'hidden_states': hidden_states.numpy(),
'position_ids': position_ids.numpy(),
'causal_mask': multiple_causal_mask.numpy(),
'start_pos': np.array([batch_pos], dtype=np.int32)
}
output = model_parts[f'{part}_prefill'].predict(inputs, shared_state)
hidden_states = torch.from_numpy(output['dummy_output'])
except Exception as e:
handle_coreml_shape_error(e, f"transformer model (part {part})")
raise
else:
# Use regular predict if no prefill function
try:
inputs = {
'hidden_states': hidden_states.numpy(),
'update_mask': torch.zeros((1, 1, context_size, 1), dtype=torch.float16).numpy(),
'position_ids': position_ids.numpy(),
'causal_mask': multiple_causal_mask.numpy(),
'current_pos': position_ids[0].numpy()
}
output = model_parts[part].predict(inputs, shared_state)
hidden_states = torch.from_numpy(output['transformer_output'])
except Exception as e:
handle_coreml_shape_error(e, f"transformer model (part {part})")
raise
elif any(key.startswith('2D') for key in model_parts.keys()):
# 123D mode with dual split (no prefill functions)
for base_part in ['2D1', '2D2']:
# Find the correct model key (with lut suffix if present)
part_key = next(key for key in model_parts.keys() if key.startswith(f'{base_part}_') or key == base_part)
try:
inputs = {
'hidden_states': hidden_states.numpy(),
'update_mask': torch.zeros((1, 1, context_size, 1), dtype=torch.float16).numpy(),
'position_ids': position_ids.numpy(),
'causal_mask': multiple_causal_mask.numpy(),
'current_pos': position_ids[0].numpy()
}
output = model_parts[part_key].predict(inputs, shared_state)
hidden_states = torch.from_numpy(output['transformer_output'])
except Exception as e:
handle_coreml_shape_error(e, f"transformer model (part {part_key})")
raise
batch_pos = batch_end
except Exception as e:
print(f"\n[ERROR] Failed processing batch {batch_pos}-{batch_end}:")
print(f" {str(e)}")
raise
return torch.tensor([current_pos], dtype=torch.int32)
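# Batching sketch: with current_pos=150 and batch_size=64, PreFillChunk processes
# token windows [0, 64), [64, 128) and [128, 150); the final 22-token window is
# zero-padded to the fixed batch width of 64 before being fed to the CoreML models.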
def PreFillChunkOneByOne(model_parts, input_ids, current_pos, context_size, causal_mask):
"""Process prefill tokens one at a time using infer function."""
#print(f"[DEBUG] Starting one-by-one prefill for {current_pos} tokens")
for pos in range(current_pos):
# Get current token
current_token = input_ids[:, pos:pos+1]
single_causal_mask = causal_mask[:, :, pos:pos+1, :]
current_pos_tensor = torch.tensor([pos], dtype=torch.int32)
# Find the correct model key for part 1 (with lut suffix if present)
part1_key = next(key for key in model_parts.keys() if key.startswith('1_') or key == '1')
# Run embeddings (part 1)
hidden_states = torch.from_numpy(model_parts[part1_key].predict({
'input_ids': current_token.numpy()
})['hidden_states'])
#print(f"[DEBUG] pos: {pos} token: {current_token.item()} states: {hidden_states.shape}")
# Get shared transformer state
shared_state = model_parts['states']['transformer']
# Handle different model configurations
if any(f'{part}_infer' in model_parts for part in ['2D1S', '2D2S']):
# S123 mode with prefill/infer functions
for part in ['2D1S', '2D2S']:
inputs = {
'hidden_states': hidden_states.numpy(),
'update_mask': np.zeros((1, 1, context_size, 1), dtype=np.float16),
'position_ids': current_pos_tensor.numpy(),
'causal_mask': single_causal_mask.numpy(),
'current_pos': current_pos_tensor.numpy()
}
output = model_parts[f'{part}_infer'].predict(inputs, shared_state)
hidden_states = torch.from_numpy(output['transformer_output'])
elif any(key.startswith('2D') for key in model_parts.keys()):
# 123D mode or individual parts mode
for base_part in ['2D1', '2D2']:
# Find the correct model key (with lut suffix if present)
part_key = next(key for key in model_parts.keys() if key.startswith(f'{base_part}_') or key == base_part)
inputs = {
'hidden_states': hidden_states.numpy(),
'update_mask': np.zeros((1, 1, context_size, 1), dtype=np.float16),
'position_ids': current_pos_tensor.numpy(),
'causal_mask': single_causal_mask.numpy(),
'current_pos': current_pos_tensor.numpy()
}
output = model_parts[part_key].predict(inputs, shared_state)
hidden_states = torch.from_numpy(output['transformer_output'])
return torch.tensor([current_pos], dtype=torch.int32)
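# Note: unlike PreFillChunk above, this path feeds one token per predict() call via the
# infer-style inputs, so it avoids batch padding at the cost of one model call per token.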
def run_inference(model_parts, tokenizer, prompt, context_size=CONTEXT_LENGTH, num_iterations=5, temperature=0.0):
"""Run inference using model parts."""
DebugLog(f"\nPrompt: {prompt}")
if temperature > 0:
DebugLog(f"Using temperature: {temperature}")
# Prepare the prompt
messages = [{"role": "user", "content": prompt}]
formatted_input = tokenizer.apply_chat_template(
messages,
return_tensors="pt",
add_generation_prompt=False
)
decoded_input = tokenizer.decode(formatted_input[0])
DebugLog(f"Decoded input: {decoded_input}")
DebugLog(f"prompt: {prompt}")
DebugLog(f"formatted_input size: {formatted_input.size()}")
DebugLog(f"formatted_input: {formatted_input}")
base_input_ids = formatted_input.to(torch.int32)
context_pos = base_input_ids.size(1)
prompt_tokens = context_pos - 1
# Pad sequence to context_size
input_ids = F.pad(
base_input_ids,
(0, context_size - context_pos),
value=0
)
DebugLog(f"context_pos (prompt length) = {context_pos}")
# Create causal mask
causal_mask = make_causal_mask(context_size, 0)
causal_mask = torch.tensor(causal_mask, dtype=torch.float16)
# Prefill phase
DebugLog("\nStarting prefill...")
start_time = time.time()
# Check if we're using 123D mode or individual parts
use_single_token = any('2D' in key for key in model_parts.keys())
if False: #use_single_token:
print("\nRunning ST prefill...")
current_pos = PreFillChunkOneByOne(
model_parts,
input_ids,
context_pos - 1,
context_size,
causal_mask
)
sequential_prefill_time = time.time() - start_time
batch_prefill_time = 0.0
else:
print("\nRunning batch prefill...")
current_pos = PreFillChunk(
model_parts,
input_ids,
context_pos - 1,
context_size,
causal_mask,
batch_size=PREFILL_BATCH_SIZE
)
batch_prefill_time = time.time() - start_time
sequential_prefill_time = 0.0
# Initialize token printer
token_printer = TokenPrinter(tokenizer)
print("\nGenerated response:", end=' ', flush=True)
# Generation loop
start_gen_time = time.time()
pos = context_pos - 1
tokens_generated = 0
try:
DebugLog(f"\nStarting inference... context_pos: {context_pos}")
pos = context_pos
for step in range(num_iterations):
with torch.no_grad():
# Check if we need to shift cache
if pos >= context_size - 2:
shift_size = context_size // 4
new_size = context_size - shift_size
# Create shifted input_ids and preserve the most recent context
# Don't add BOS token since this is a continuation
tmp = torch.zeros((1, context_size), dtype=torch.int32)
tmp[:,0:new_size] = input_ids[:,shift_size:context_size]
input_ids = tmp
# Adjust position after shift
pos = new_size
# Create update mask for current position
update_mask = torch.zeros((1, 1, context_size, 1), dtype=torch.float16)
update_mask[0, 0, pos-1, 0] = 1.0
#print(f"\n[DEBUG] Shifted cache by {shift_size} tokens, maintaining context window of {new_size} tokens, new pos: {pos}")
# For Q123 mode, we need to run prefill on the shifted sequence
if any(part.startswith('2Q') for part in model_parts):
# Run prefill using PreFillChunk with proper batch size
# No need to adjust position since we're not adding BOS
current_pos = PreFillChunk(
model_parts,
input_ids,
pos-1, # number of tokens to re-prefill after the shift
context_size, # Use full context size
causal_mask,
batch_size=PREFILL_BATCH_SIZE
)
#print(f"[DEBUG] Ran prefill after shift for position {pos} with batch_size={PREFILL_BATCH_SIZE}")
# Position should already be correct since we didn't add BOS
pos = current_pos
# Get current token
current_token = input_ids[:, pos-1:pos]
# Find the correct model key for part 1 (with lut suffix if present)
part1_key = next(key for key in model_parts.keys() if key.startswith('1_') or key == '1')
# Run embeddings (part 1)
hidden_states = model_parts[part1_key].predict({
'input_ids': current_token.numpy()
})['hidden_states']
hidden_states = torch.from_numpy(hidden_states)
# Get shared transformer state
shared_state = model_parts['states']['transformer']
# Create update mask for current position
update_mask = torch.zeros((1, 1, context_size, 1), dtype=torch.float16)
update_mask[0, 0, pos-1, 0] = 1.0
# Create position IDs tensor
position_ids = torch.tensor([pos-1], dtype=torch.int32)
# Create causal mask for current position
single_causal_mask = causal_mask[:, :, pos-1:pos, :]
# Run transformer layers based on model type
if any(f'{part}_infer' in model_parts for part in ['2D1S', '2D2S']):
# S123 mode with prefill/infer functions
for part in ['2D1S', '2D2S']:
inputs = {
'hidden_states': hidden_states.numpy(),
'update_mask': update_mask.numpy(),
'position_ids': position_ids.numpy(),
'causal_mask': single_causal_mask.numpy(),
'current_pos': position_ids.numpy()
}
output = model_parts[f'{part}_infer'].predict(inputs, shared_state)
hidden_states = torch.from_numpy(output['transformer_output'])
elif any(f'2Q{i}S_infer' in model_parts for i in range(1, 5)):
# Q123S mode with combined quad split
for i in range(1, 5):
part = f'2Q{i}S'
inputs = {
'hidden_states': hidden_states.numpy(),
'update_mask': update_mask.numpy(),
'position_ids': position_ids.numpy(),
'causal_mask': single_causal_mask.numpy(),
'current_pos': position_ids.numpy()
}
output = model_parts[f'{part}_infer'].predict(inputs, shared_state)
hidden_states = torch.from_numpy(output['transformer_output'])
elif any(part.startswith('2Q') for part in model_parts):
# Q123 mode with quad split
#print(f"[DEBUG] Running quad split inference at position {pos}")
for i in range(1, 5):
part = f'2Q{i}'
if f'{part}_infer' in model_parts:
# Use infer function if available
inputs = {
'hidden_states': hidden_states.numpy(),
'update_mask': update_mask.numpy(),
'position_ids': position_ids.numpy(),
'causal_mask': single_causal_mask.numpy(),
'current_pos': position_ids.numpy()
}
output = model_parts[f'{part}_infer'].predict(inputs, shared_state)
else:
# Use regular predict if no infer function
inputs = {
'hidden_states': hidden_states.numpy(),
'update_mask': update_mask.numpy(),
'position_ids': position_ids.numpy(),
'causal_mask': single_causal_mask.numpy(),
'current_pos': position_ids.numpy()
}
output = model_parts[part].predict(inputs, shared_state)
hidden_states = torch.from_numpy(output['transformer_output'])
elif any(key.startswith('2D') for key in model_parts.keys()):
# 123D mode or individual parts mode
for base_part in ['2D1', '2D2']:
# Find the correct model key (with lut suffix if present)
part_key = next(key for key in model_parts.keys() if key.startswith(f'{base_part}_') or key == base_part)
inputs = {
'hidden_states': hidden_states.numpy(),
'update_mask': update_mask.numpy(),
'position_ids': position_ids.numpy(),
'causal_mask': single_causal_mask.numpy(),
'current_pos': position_ids.numpy()
}
output = model_parts[part_key].predict(inputs, shared_state)
hidden_states = torch.from_numpy(output['transformer_output'])
else:
print("\n[ERROR] No transformer model parts found!")
break
try:
# Run final layer norm and get logits
# Find the correct model key for part 3 (with lut suffix if present)
part3_key = next(key for key in model_parts.keys() if key.startswith('3_') or key == '3')
output_dict = model_parts[part3_key].predict({
'hidden_states': hidden_states.numpy()
})
if ENABLE_VOCAB_SPLIT8:
# Get all logits parts in a single call
logits_parts = []
for i in range(1, 9):
logits_parts.append(output_dict[f'logits{i}'])
logits = np.concatenate(logits_parts, axis=-1)
elif ENABLE_LOGITS2:
# Get both logits parts in a single call
logits = np.concatenate([
output_dict['logits1'],
output_dict['logits2']
], axis=-1)
else:
logits = output_dict['logits']
# Convert to tensor and get next token
logits = torch.from_numpy(logits)
# Apply temperature if specified
if temperature > 0:
# Scale logits by temperature
logits = logits / temperature
# Apply softmax to get probabilities
probs = F.softmax(logits[0, -1, :], dim=-1)
# Sample from the distribution
next_token = torch.multinomial(probs, num_samples=1).item()
else:
# Use argmax if no temperature
next_token = torch.argmax(logits[0, -1, :]).item()
# Add token to input sequence
input_ids[0, pos] = next_token
token_printer.add_token(next_token)
# Safely decode tokens in the main thread
token_printer.drain_buffer()
# Update position and count
pos += 1
tokens_generated += 1
if next_token == tokenizer.eos_token_id:
print("\n[DEBUG] Generated EOS token, stopping...")
break
except Exception as e:
print(f"\n[ERROR] Error in final layer or token generation: {str(e)}")
break
except KeyboardInterrupt:
print("\n[DEBUG] Interrupted by user")
except Exception as e:
print(f"\n[ERROR] Exception during inference: {str(e)}")
print(traceback.format_exc())
# Print timing statistics
end_time = time.time()
total_time = end_time - start_gen_time
print(f"\n\nTotal time: {total_time:.2f} seconds")
print(f"Generation tokens: {tokens_generated}")
print(f"Prefill tokens: {prompt_tokens}")
print(f"Total tokens (prefill + generation): {prompt_tokens + tokens_generated}")
if prompt_tokens > 0:
if batch_prefill_time > 0: # If using batch prefill
prefill_tokens_per_second = prompt_tokens / batch_prefill_time
effective_prefill_tokens_per_second = prompt_tokens / batch_prefill_time # Don't multiply by batch size
print(f"Actual prefill tokens per second: {prefill_tokens_per_second:.2f}")
print(f"Effective prefill tokens per second (batch={PREFILL_BATCH_SIZE}): {effective_prefill_tokens_per_second:.2f}")
elif sequential_prefill_time > 0: # If using sequential prefill
prefill_tokens_per_second = prompt_tokens / sequential_prefill_time
print(f"Sequential prefill tokens per second: {prefill_tokens_per_second:.2f}")
if tokens_generated > 0:
total_processing_time = total_time + (batch_prefill_time if batch_prefill_time > 0 else sequential_prefill_time)
overall_tokens_per_second = (prompt_tokens + tokens_generated) / total_processing_time
generation_tokens_per_second = tokens_generated / total_time
print(f"Overall tokens processed per second (including prefill): {overall_tokens_per_second:.2f}")
print(f"Generation-only tokens per second: {generation_tokens_per_second:.2f}")
return token_printer.stop(), {
'total_time': total_time,
'batch_prefill_time': batch_prefill_time,
'sequential_prefill_time': sequential_prefill_time,
'tokens_generated': tokens_generated,
'prompt_tokens': prompt_tokens
}
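# Usage sketch (assumed model_parts dict as built in main()):
#   response, stats = run_inference(model_parts, tokenizer, "What is the Apple Neural Engine?",
#                                   context_size=CONTEXT_LENGTH, num_iterations=100)
#   print(stats['tokens_generated'], stats['total_time'])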
def DebugLog(message, always_print=False):
"""Print debug message if ENABLE_CHAT_DEBUG is True or always_print is True.
Args:
message: Message to print
always_print: If True, print regardless of ENABLE_CHAT_DEBUG setting
"""
if ENABLE_CHAT_DEBUG or always_print:
print(f"[DEBUG] {message}")
def chat_loop(model_parts, tokenizer, context_size=CONTEXT_LENGTH, temperature=0.0):
"""Interactive chat loop that maintains conversation history."""
print("\nStarting chat session. Press Ctrl+D to exit.")
print("Type your message and press Enter to chat.")
DebugLog(f"Using context size: {context_size}")
DebugLog(f"Temperature: {temperature}")
DebugLog(f"Model parts loaded: {list(model_parts.keys())}")
# Initialize conversation history
conversation = []
input_ids = None
current_pos = 0
try:
while True:
try:
print(f"\n{LIGHT_GREEN}You:{RESET_COLOR}", end=' ', flush=True)
user_input = input().strip()
except EOFError:
print("\nExiting chat...")
break
if not user_input:
continue
# Add user message to conversation
conversation.append({"role": "user", "content": user_input})
DebugLog("\nFormatting conversation:")
for msg in conversation:
DebugLog(f" {msg['role']}: {msg['content'][:50]}...")
# Format entire conversation
formatted_input = tokenizer.apply_chat_template(
conversation,
return_tensors="pt",
add_generation_prompt=True
)
DebugLog("\nTokenization:")
DebugLog(f"Input token IDs: {formatted_input[0][:50]}...")
DebugLog(f"Decoded tokens: {tokenizer.decode(formatted_input[0][:50])}...")
DebugLog(f"Total tokens: {formatted_input.size(1)}")
# Convert to int32 tensor
base_input_ids = formatted_input.to(torch.int32)
context_pos = base_input_ids.size(1)
DebugLog(f"Context position: {context_pos}")
# Check if we need to truncate history
if context_pos >= context_size - 100:
DebugLog(f"\nNeed to truncate: {context_pos} tokens > {context_size-100} limit")
while context_pos >= context_size - 100 and len(conversation) > 2:
removed = conversation.pop(0)
DebugLog(f"Removed message: {removed['role']}: {removed['content'][:30]}...")
formatted_input = tokenizer.apply_chat_template(
conversation,
return_tensors="pt",
add_generation_prompt=True
)
base_input_ids = formatted_input.to(torch.int32)
context_pos = base_input_ids.size(1)
DebugLog(f"New context size: {context_pos}")
# Pad sequence to context_size
input_ids = F.pad(
base_input_ids,
(0, context_size - context_pos),
value=0
)
# Create causal mask for the entire context
causal_mask = make_causal_mask(context_size, 0)
causal_mask = torch.tensor(causal_mask, dtype=torch.float16)
DebugLog(f"Created causal mask with shape: {causal_mask.shape}")
print(f"\n{LIGHT_BLUE}Assistant:{RESET_COLOR}", end=' ', flush=True)
# Run prefill on entire context
if False: #any('2D' in key for key in model_parts.keys()):
DebugLog("Using sequential prefill")
current_pos = PreFillChunkOneByOne(
model_parts,
input_ids,
context_pos,
context_size,
causal_mask
)
elif any(part.startswith('2Q') for part in model_parts.keys()):
DebugLog(f"Using quad split prefill (size={PREFILL_BATCH_SIZE})")
current_pos = PreFillChunk(
model_parts,
input_ids,
context_pos,
context_size,
causal_mask,
batch_size=PREFILL_BATCH_SIZE
)
else:
DebugLog(f"Using standard batch prefill (size={PREFILL_BATCH_SIZE})")
current_pos = PreFillChunk(
model_parts,
input_ids,
context_pos,
context_size,
causal_mask,
batch_size=PREFILL_BATCH_SIZE
)
# Initialize token printer
token_printer = TokenPrinter(tokenizer)
# Generation loop
pos = context_pos
response_tokens = []
generation_start_time = time.time() # Add timing
try:
while True: # Changed from context_size - 1 to True for continuous generation
# Check if we need to shift window
if pos >= context_size - 2:
DebugLog("\nShifting context window...")
shift_size = context_size // 4 # Shift by 1/4 of context
new_size = context_size - shift_size
# Create shifted input_ids and preserve the most recent context
tmp = torch.zeros((1, context_size), dtype=torch.int32)
tmp[:,0:new_size] = input_ids[:,shift_size:context_size]
input_ids = tmp
# Adjust position after shift
pos = new_size
DebugLog(f"Shifted window by {shift_size} tokens, new position: {pos}")
# Run prefill on the shifted sequence
if False: #if any('2D' in key for key in model_parts.keys()):
DebugLog("Running sequential prefill after shift")
current_pos = PreFillChunkOneByOne(
model_parts,
input_ids,
pos,
context_size,
causal_mask
)
else:
DebugLog("Running batch prefill after shift (size={PREFILL_BATCH_SIZE})")
current_pos = PreFillChunk(
model_parts,
input_ids,
pos,
context_size,
causal_mask,
batch_size=PREFILL_BATCH_SIZE
)
# Get current token
current_token = input_ids[:, pos-1:pos]
# Find the correct model key for part 1
part1_key = next(key for key in model_parts.keys() if key.startswith('1_') or key == '1')
# Run embeddings (part 1)
hidden_states = model_parts[part1_key].predict({
'input_ids': current_token.numpy()
})['hidden_states']
hidden_states = torch.from_numpy(hidden_states)
# Get shared transformer state
shared_state = model_parts['states']['transformer']
# Create update mask for current position
update_mask = torch.zeros((1, 1, context_size, 1), dtype=torch.float16)
update_mask[0, 0, pos-1, 0] = 1.0
# Create position IDs tensor
position_ids = torch.tensor([pos-1], dtype=torch.int32)
# Create causal mask for current position
single_causal_mask = causal_mask[:, :, pos-1:pos, :]
# Run transformer layers based on model type
if any(f'{part}_infer' in model_parts for part in ['2D1S', '2D2S']):
for part in ['2D1S', '2D2S']:
inputs = {
'hidden_states': hidden_states.numpy(),
'update_mask': update_mask.numpy(),
'position_ids': position_ids.numpy(),
'causal_mask': single_causal_mask.numpy(),
'current_pos': position_ids.numpy()
}
output = model_parts[f'{part}_infer'].predict(inputs, shared_state)
hidden_states = torch.from_numpy(output['transformer_output'])
elif any(part.startswith('2Q') for part in model_parts.keys()):
DebugLog(f"Running quad split inference at position {pos}")
for i in range(1, 5):
part = f'2Q{i}S' if f'2Q{i}S_infer' in model_parts else f'2Q{i}'  # prefer combined (S) variant when loaded
if f'{part}_infer' in model_parts:
# Use infer function if available
inputs = {
'hidden_states': hidden_states.numpy(),
'update_mask': update_mask.numpy(),
'position_ids': position_ids.numpy(),
'causal_mask': single_causal_mask.numpy(),
'current_pos': position_ids.numpy()
}
output = model_parts[f'{part}_infer'].predict(inputs, shared_state)
else:
# Use regular predict if no infer function
inputs = {
'hidden_states': hidden_states.numpy(),
'update_mask': update_mask.numpy(),
'position_ids': position_ids.numpy(),
'causal_mask': single_causal_mask.numpy(),
'current_pos': position_ids.numpy()
}
output = model_parts[part].predict(inputs, shared_state)
hidden_states = torch.from_numpy(output['transformer_output'])
elif any(key.startswith('2D') for key in model_parts.keys()):
for base_part in ['2D1', '2D2']:
part_key = next(key for key in model_parts.keys() if key.startswith(f'{base_part}_') or key == base_part)
inputs = {
'hidden_states': hidden_states.numpy(),
'update_mask': update_mask.numpy(),
'position_ids': position_ids.numpy(),
'causal_mask': single_causal_mask.numpy(),
'current_pos': position_ids.numpy()
}
output = model_parts[part_key].predict(inputs, shared_state)
hidden_states = torch.from_numpy(output['transformer_output'])
# Run final layer norm and get logits
part3_key = next(key for key in model_parts.keys() if key.startswith('3_') or key == '3')
output_dict = model_parts[part3_key].predict({
'hidden_states': hidden_states.numpy()
})
if ENABLE_VOCAB_SPLIT8:
logits_parts = []
for i in range(1, 9):
logits_parts.append(output_dict[f'logits{i}'])
logits = np.concatenate(logits_parts, axis=-1)
else:
logits = output_dict['logits']
# Convert to tensor and get next token
logits = torch.from_numpy(logits)
# Apply temperature if specified
if temperature > 0:
logits = logits / temperature
probs = F.softmax(logits[0, -1, :], dim=-1)
next_token = torch.multinomial(probs, num_samples=1).item()
else:
next_token = torch.argmax(logits[0, -1, :]).item()
# Add token to input sequence and response
input_ids[0, pos] = next_token
response_tokens.append(next_token)
token_printer.add_token(next_token)
# Safely decode tokens in the main thread
token_printer.drain_buffer()
pos += 1
# Add debug output for generated tokens
if ENABLE_CHAT_DEBUG and len(response_tokens) > 0 and len(response_tokens) % 10 == 0:
DebugLog(f"\nGenerated {len(response_tokens)} tokens")
DebugLog(f"Last token: {next_token} -> '{tokenizer.decode([next_token])}'")
if next_token == tokenizer.eos_token_id:
DebugLog("\nGenerated EOS token")
break
# Get the complete response text and calculate stats
response_text = token_printer.stop()
generation_time = time.time() - generation_start_time
tokens_per_second = len(response_tokens) / generation_time if generation_time > 0 else 0
DebugLog(f"\nFinal response length: {len(response_tokens)} tokens")
# Print generation stats in dark blue
print(f"\n{DARK_BLUE}[{len(response_tokens)} tokens, {tokens_per_second:.1f} tokens/s]{RESET_COLOR}")
except KeyboardInterrupt:
DebugLog("\nGeneration interrupted by user")
response_text = token_printer.stop()
generation_time = time.time() - generation_start_time
tokens_per_second = len(response_tokens) / generation_time if generation_time > 0 else 0
print(f"\n{DARK_BLUE}[{len(response_tokens)} tokens, {tokens_per_second:.1f} tokens/s]{RESET_COLOR}")
# Add assistant's response to conversation history
conversation.append({"role": "assistant", "content": response_text})
except Exception as e:
print(f"\n[ERROR] Chat loop error: {str(e)}")
print(traceback.format_exc())
def main():
global CONTEXT_LENGTH, PREFILL_BATCH_SIZE, MODEL_PATH
print("ANEMLL Chat. Pre-relase alpha version, 2025-01-31")
print("Copyright (c) 2025, Anemll All rights reserved.")
# Set default parameters
model_type = "Q123" # Default model type
lut_suffix = "lut4" # Default LUT suffix
temperature = 0.0
model_parts = {}
model_path = "." # Default to current directory
if len(sys.argv) < 2:
print("Usage: python chat.py [model_parts] [options]")
print("Usage: python chat.py [model_parts] [options]")
print("\nOptions:")
print(" -d PATH # Model directory path (for both tokenizer and CoreML models)")
print(" S123 # Combined split model (2D1S+2D2S)")
print(" C123 # Combined part2 model with prefill/infer")
print(" Q123 # Quad split model (2Q1-2Q4) [default]")
print(" Q123S # Combined quad split model (2Q1S-2Q4S)")
print(" 1 2D1 2D2 3 # Individual split parts")
print(" pfN # Prefill batch size (e.g., pf128)")
print(" ctx=N # Context length (e.g., ctx=2048) [default: 1024]")
print(" temp=X # Temperature for sampling (e.g., temp=0.01)")
print(" lut4 # LUT suffix [default]")
print("\nDefault configuration: Q123 lut4 ctx=1024")
print(" python chat.py Q123 -d ../anemll-DeepSeek-8B-ctx1024")
# Use defaults instead of exiting
print("\nUsing default configuration...")
else:
# Process command line arguments
i = 1
while i < len(sys.argv):
if sys.argv[i] == '-d' and i + 1 < len(sys.argv):
model_path = sys.argv[i + 1]
i += 2
# Extract context length from model path if present
ctx_match = re.search(r'ctx(\d+)', model_path)
if ctx_match:
ctx_value = int(ctx_match.group(1))
if 512 <= ctx_value <= 4096*2:
CONTEXT_LENGTH = ctx_value
print(f"Setting context length to {CONTEXT_LENGTH} from model path")
continue
elif sys.argv[i].startswith('lut'):
lut_suffix = sys.argv[i]
elif sys.argv[i] in ['S123', 'Q123', 'Q123S', 'C123', '123D']:
model_type = sys.argv[i]
i += 1
# Initialize tokenizer using the same path
tokenizer = initialize_tokenizer(model_path)
if tokenizer is None:
print("[ERROR] Failed to initialize tokenizer. Exiting.")
return
# Process model parts
parts = [model_type]
if lut_suffix:
parts.append(lut_suffix)
try:
split_model = SplitModelInference(parts, model_dir=model_path)
model_parts.update(split_model.models)
model_parts['states'] = {'transformer': split_model.states['transformer']}
except Exception as e:
print(f"Error loading model parts: {str(e)}")
return
# Process remaining arguments
i = 1
while i < len(sys.argv):
arg = sys.argv[i]
if arg.startswith('pf') and arg[2:].isdigit():
PREFILL_BATCH_SIZE = int(arg[2:])
elif arg.startswith('ctx='):
try:
CONTEXT_LENGTH = int(arg.split('=')[1])
except (IndexError, ValueError):
print(f"[WARNING] Invalid context length format. Using default: {CONTEXT_LENGTH}")
elif arg.startswith('temp='):
try:
temperature = float(arg.split('=')[1])
if temperature < 0:
print(f"[WARNING] Temperature must be non-negative. Using default: 0.0")
temperature = 0.0
except (IndexError, ValueError):
print(f"[WARNING] Invalid temperature format. Using default: 0.0")
i += 1
try:
# Start interactive chat loop
chat_loop(model_parts, tokenizer, context_size=CONTEXT_LENGTH, temperature=temperature)
except Exception as e:
print("An error occurred:")
print(traceback.format_exc())
if __name__ == "__main__":
main()