Manoj Kumar committed on
Commit e6f4fec · 1 Parent(s): 867cb42

Mark Phase 1

Files changed (8)
  1. .DS_Store +0 -0
  2. Mark-1/db_creation.py +91 -0
  3. Mark-1/phase1.py +274 -0
  4. db.py +1 -1
  5. ecommerce.db +0 -0
  6. requirements.txt +4 -1
  7. wikiPreTrained.py +121 -0
  8. wikiSQL.py +199 -0
.DS_Store ADDED
Binary file (6.15 kB).
 
Mark-1/db_creation.py ADDED
@@ -0,0 +1,91 @@
+ import sqlite3
+ import random
+ from faker import Faker
+
+ # Initialize Faker for generating random data
+ fake = Faker()
+
+ # Define custom schema
+ custom_schema = {
+     "products": {
+         "columns": ["product_id INTEGER PRIMARY KEY", "name TEXT", "price REAL", "category_id INTEGER"],
+         "relations": ["category_id -> categories.id"],
+     },
+     "categories": {
+         "columns": ["id INTEGER PRIMARY KEY", "category_name TEXT"],
+         "relations": None,
+     },
+     "orders": {
+         "columns": ["order_id INTEGER PRIMARY KEY", "user_id INTEGER", "product_id INTEGER", "order_date TEXT"],
+         "relations": ["product_id -> products.product_id", "user_id -> users.user_id"],
+     },
+     "users": {
+         "columns": [
+             "user_id INTEGER PRIMARY KEY",
+             "first_name TEXT",
+             "last_name TEXT",
+             "email TEXT UNIQUE",
+             "phone_number TEXT",
+             "address TEXT"
+         ],
+         "relations": None,
+     }
+ }
+
+ # Connect to SQLite database
+ conn = sqlite3.connect("ecommerce.db")
+ cursor = conn.cursor()
+
+ # Function to create tables based on schema
+ def create_tables():
+     for table_name, table_data in custom_schema.items():
+         columns = ", ".join(table_data["columns"])
+         table_sql = f"CREATE TABLE IF NOT EXISTS {table_name} ({columns}"
+
+         if table_data["relations"]:
+             for relation in table_data["relations"]:
+                 col_name, ref_table = relation.split(" -> ")
+                 ref_col = ref_table.split(".")[1]
+                 ref_table_name = ref_table.split(".")[0]
+                 table_sql += f", FOREIGN KEY({col_name}) REFERENCES {ref_table_name}({ref_col})"
+
+         table_sql += ");"
+         cursor.execute(table_sql)
+
+ # Function to populate categories table
+ def insert_categories():
+     categories = [(i, fake.word().capitalize() + " " + fake.word().capitalize()) for i in range(1, 1001)]
+     cursor.executemany("INSERT INTO categories (id, category_name) VALUES (?, ?)", categories)
+     return categories
+
+ # Function to populate products table
+ def insert_products(categories):
+     products = [(i, fake.company() + " " + fake.word().capitalize(), round(random.uniform(10, 1000), 2), random.choice(categories)[0]) for i in range(1, 1001)]
+     cursor.executemany("INSERT INTO products (product_id, name, price, category_id) VALUES (?, ?, ?, ?)", products)
+     return products
+
+ # Function to populate users table (unique emails keep the UNIQUE constraint from silently dropping rows)
+ def insert_users():
+     users = [(i, fake.first_name(), fake.last_name(), fake.unique.email(), fake.phone_number(), fake.address()) for i in range(1, 1001)]
+     cursor.executemany("INSERT OR IGNORE INTO users (user_id, first_name, last_name, email, phone_number, address) VALUES (?, ?, ?, ?, ?, ?)", users)
+     return users
+
+ # Function to populate orders table
+ def insert_orders(users, products):
+     orders = [(i, random.choice(users)[0], random.choice(products)[0], fake.date_this_year().strftime("%Y-%m-%d")) for i in range(1, 1001)]
+     cursor.executemany("INSERT INTO orders (order_id, user_id, product_id, order_date) VALUES (?, ?, ?, ?)", orders)
+
+ # Create tables
+ create_tables()
+
+ # Insert data into tables
+ categories = insert_categories()
+ products = insert_products(categories)
+ users = insert_users()
+ insert_orders(users, products)
+
+ # Commit and close connection
+ conn.commit()
+ conn.close()
+
+ print("1000 rows inserted into each table successfully!")
Mark-1/phase1.py ADDED
@@ -0,0 +1,274 @@
+ import sqlite3
+ import spacy
+ import re
+ from thefuzz import process
+ import numpy as np
+ from transformers import pipeline
+
+ # Load intent classification model
+ # Use Hugging Face's zero-shot pipeline for flexibility
+ classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
+ nlp = spacy.load("en_core_web_sm")
+ nlp_vectors = spacy.load("en_core_web_md")
+
+
+ # Define operator mappings
+ operator_mappings = {
+     "greater than": ">",
+     "less than": "<",
+     "equal to": "=",
+     "not equal to": "!=",
+     "starts with": "LIKE",
+     "ends with": "LIKE",
+     "contains": "LIKE",
+     "above": ">",
+     "below": "<",
+     "more than": ">",
+     "<": "<",
+     ">": ">"
+ }
+
+ # Connect to SQLite database
+ def connect_to_db(db_path):
+     conn = sqlite3.connect(db_path)
+     return conn
+
+ # Fetch database schema
+ def fetch_schema(conn):
+     cursor = conn.cursor()
+     query = """
+     SELECT name
+     FROM sqlite_master
+     WHERE type='table';
+     """
+     cursor.execute(query)
+     tables = cursor.fetchall()
+
+     schema = {}
+     for table in tables:
+         table_name = table[0]
+         cursor.execute(f"PRAGMA table_info({table_name});")
+         columns = cursor.fetchall()
+         schema[table_name] = [{"name": col[1], "type": col[2], "not_null": col[3], "default": col[4], "pk": col[5]} for col in columns]
+
+     return schema
+
+ def find_ai_synonym(token_text, table_schema):
+     """Return the best-matching column from table_schema based on vector similarity."""
+     token_vec = nlp_vectors(token_text)[0].vector
+     best_col = None
+     best_score = 0.0
+
+     for col in table_schema:
+         col_vec = nlp_vectors(col)[0].vector
+         # Cosine similarity (the small epsilon guards against zero vectors for out-of-vocabulary tokens)
+         score = token_vec.dot(col_vec) / (np.linalg.norm(token_vec) * np.linalg.norm(col_vec) + 1e-8)
+         if score > best_score:
+             best_score = score
+             best_col = col
+
+     # Apply threshold
+     if best_score > 0.65:
+         return best_col
+     return None
+
+ def identify_table(question, schema_tables):
+     # schema_tables = ["products", "users", "orders", ...]
+     table, score = process.extractOne(question, schema_tables)
+
+     if score > 80:  # a comfortable threshold
+         return table
+     return None
+
+ def identify_columns(question, columns_for_table):
+     # columns_for_table = ["id", "price", "stock", "name", ...]
+     # For each token in question, fuzzy match to columns
+     matched_cols = []
+     tokens = question.lower().split()
+     for token in tokens:
+         col, score = process.extractOne(token, columns_for_table)
+         if score > 80:
+             matched_cols.append(col)
+     return matched_cols
+
+ def find_closest_column(token, table_schema):
+     # table_schema is a list of column names, e.g. ["price", "stock", "name"]
+     # This returns (best_match, score)
+     best_match, score = process.extractOne(token, table_schema)
+     # You can tune this threshold as needed (e.g. 70, 80, etc.)
+     if score > 90:
+         return best_match
+     return None
+
+ # Condition extraction with NLP
+ def extract_conditions(question, schema, table):
+     table_schema = [col["name"].lower() for col in schema.get(table, [])]
+
+     # Detect whether the user used 'AND' / 'OR'
+     # (case-insensitive, hence .lower() checks)
+     use_and = " and " in question.lower()
+     use_or = " or " in question.lower()
+     last_column = None
+
+     # Split on 'and' or 'or' to handle multiple conditions
+     condition_parts = re.split(r'\band\b|\bor\b', question, flags=re.IGNORECASE)
+
+     print(condition_parts)
+
+     conditions = []
+
+     for part in condition_parts:
+         part = part.strip()
+
+         # Use spaCy to tokenize each part
+         doc = nlp(part.lower())
+         tokens = [token.text for token in doc]
+
+         # Skip the recognized table token if it appears in tokens
+         # so it won't be matched as a column
+         tokens = [t for t in tokens if t != table.lower()]
+
+         part_conditions = []
+         current_part_column = None
+
+         print(tokens)
+
+         for token in tokens:
+             # Try synonyms/fuzzy, etc. to find a column
+             possible_col = find_ai_synonym(token, table_schema)
+             if possible_col:
+                 current_part_column = possible_col
+                 last_column = possible_col  # update last_column
+
+         # Check for any matching operator phrase in this part
+         for phrase, sql_operator in operator_mappings.items():
+             if phrase in part.lower():
+                 # Extract the value after the phrase
+                 value_index = part.lower().find(phrase) + len(phrase)
+                 value = part[value_index:].strip().split(" ")[0]
+                 value = value.replace("'", "").replace('"', "").strip()
+
+                 # Special handling for LIKE operators
+                 if sql_operator == "LIKE":
+                     if "starts with" in phrase:
+                         value = f"'{value}%'"
+                     elif "ends with" in phrase:
+                         value = f"'%{value}'"
+                     elif "contains" in phrase:
+                         value = f"'%{value}%'"
+
+                 # If we did not find a new column, fall back to last_column
+                 column_to_use = current_part_column or last_column
+                 if column_to_use:
+                     # Add this condition to the list for this part
+                     part_conditions.append(f"{column_to_use} {sql_operator} {value}")
+
+         # If multiple conditions are found in this part, join them with AND
+         # (e.g., "price > 100 AND stock < 50" within the same part)
+         if part_conditions:
+             conditions.append(" AND ".join(part_conditions))
+
+     # Finally, combine each part with AND or OR, depending on the user query
+     if use_and:
+         return " AND ".join(conditions)
+     elif use_or:
+         return " OR ".join(conditions)
+     else:
+         # If there's only one part or no explicit 'and'/'or', default to AND
+         return " AND ".join(conditions)
+
+ # Interpret user question using intent recognition
+ def interpret_question(question, schema):
+     # Define potential intents
+     intents = {
+         "describe_table": "Provide information about the columns and structure of a table.",
+         "list_table_data": "Fetch and display all data stored in a table.",
+         "count_records": "Count the number of records in a table.",
+         "fetch_column": "Fetch a specific column's data from a table."
+     }
+
+     # Use classifier to predict intent
+     labels = list(intents.keys())
+     result = classifier(question, labels)
+
+     predicted_intent = result["labels"][0]
+     table = identify_table(question, list(schema.keys()))
+
+     # Rule-based fallback for conditional queries
+     condition_keywords = list(operator_mappings.keys())
+     if any(keyword in question.lower() for keyword in condition_keywords):
+         predicted_intent = "list_table_data"
+
+     return {"intent": predicted_intent, "table": table}
+
+ # Handle different intents
+ def handle_intent(intent_data, schema, conn, question):
+     intent = intent_data["intent"]
+     table = intent_data["table"]
+
+     if not table:
+         return "I couldn't identify which table you're referring to."
+
+     if intent == "describe_table":
+         # Describe table structure
+         table_schema = schema[table]
+         description = [f"Table '{table}' has the following columns:"]
+         for col in table_schema:
+             col_details = f"- {col['name']} ({col['type']})"
+             if col['not_null']:
+                 col_details += " [NOT NULL]"
+             if col['default'] is not None:
+                 col_details += f" [DEFAULT: {col['default']}]"
+             if col['pk']:
+                 col_details += " [PRIMARY KEY]"
+             description.append(col_details)
+         return "\n".join(description)
+
+     elif intent == "list_table_data":
+         # Check for conditions
+         condition = extract_conditions(question, schema, table)
+         cursor = conn.cursor()
+         query = f"SELECT * FROM {table}"
+         if condition:
+             query += f" WHERE {condition};"
+         else:
+             query += ";"
+
+         print(query)
+         cursor.execute(query)
+         return cursor.fetchall()
+
+     elif intent == "count_records":
+         # Count records in the table
+         cursor = conn.cursor()
+         cursor.execute(f"SELECT COUNT(*) FROM {table};")
+         return cursor.fetchone()
+
+     elif intent == "fetch_column":
+         return "Fetching specific column data is not yet implemented."
+
+     else:
+         return "I couldn't understand your question."
+
+ # Main function
+ def answer_question(question, conn, schema):
+     intent_data = interpret_question(question, schema)
+     print(intent_data)
+     return handle_intent(intent_data, schema, conn, question)
+
+ # Example Usage
+ if __name__ == "__main__":
+     db_path = "./ecommerce.db"  # Replace with your SQLite database path
+     conn = connect_to_db(db_path)
+     schema = fetch_schema(conn)
+
+     print("Schema:", schema)
+
+     while True:
+         question = input("\nAsk a question about the database: ")
+         if question.lower() in ["exit", "quit"]:
+             break
+
+         answer = answer_question(question, conn, schema)
+         print("Answer:", answer)
db.py CHANGED
@@ -34,7 +34,7 @@ def generate_context(schema):
  schema_context = generate_context(schema)
 
  # Step 2: Load the T5-base-text-to-sql model
- model_name = "mrm8488/t5-base-finetuned-wikiSQL" # A model fine-tuned for SQL generation
+ model_name = "suriya7/t5-base-text-to-sql" # A model fine-tuned for SQL generation
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
 
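For context on the swapped checkpoint, here is a minimal sketch of how a T5 text-to-SQL model of this kind is typically invoked. The prompt prefix is an assumption made for illustration; check the model card of suriya7/t5-base-text-to-sql for the exact input format it was fine-tuned on:

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "suriya7/t5-base-text-to-sql"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Assumed prompt format; adjust to whatever the checkpoint expects
prompt = "translate English to SQL: How many users placed an order in 2024?"
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_length=128)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))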
ecommerce.db ADDED
Binary file (258 kB).
 
requirements.txt CHANGED
@@ -1,4 +1,7 @@
  transformers
  torch
  accelerate>=0.26.0
- tiktoken
+ tiktoken
+ datasets
+ sentencepiece
+ tqdm
wikiPreTrained.py ADDED
@@ -0,0 +1,121 @@
+ from transformers import T5ForConditionalGeneration, T5Tokenizer
+ import torch
+ import re
+
+ # Load the trained model and tokenizer
+ model = T5ForConditionalGeneration.from_pretrained("./t5_sql_finetuned")
+ tokenizer = T5Tokenizer.from_pretrained("./t5_sql_finetuned")
+
+ # Define a simple function to check if the question is schema-related or SQL-related
+ def is_schema_question(question: str):
+     schema_keywords = ["columns", "tables", "structure", "schema", "relations", "fields"]
+     return any(keyword in question.lower() for keyword in schema_keywords)
+
+ # Helper function to extract table name from the question
+ def extract_table_name(question: str):
+     # Regex pattern to find table names, assuming the table name follows 'for', 'in' or 'from'
+     table_name_match = re.search(r'for (\w+)|in (\w+)|from (\w+)', question)
+
+     if table_name_match:
+         # Return whichever capturing group matched
+         return table_name_match.group(1) or table_name_match.group(2) or table_name_match.group(3)
+
+     # If no table name is detected, return None
+     return None
+
+
+ # Define a function to handle SQL generation
+ def generate_sql(question: str, schema: dict, model, tokenizer, device):
+     # Preprocess the question for SQL generation (e.g., reformat)
+     # Example question: "What is the price of the product with ID 123?"
+
+     # Here we use the model to generate the SQL query
+     inputs = tokenizer(question, return_tensors="pt")
+     input_ids = inputs.input_ids.to(device)
+
+     with torch.no_grad():
+         generated_ids = model.generate(input_ids, max_length=128)
+
+     # Decode the SQL query generated by the model
+     sql_query = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
+
+     return sql_query
+
+ # Define a function to handle schema-related questions
+ def handle_schema_question(question: str, schema: dict):
+     # Here you handle questions about the schema (tables, columns, relations)
+     # Example schema-related question: "What columns does the products table have?"
+
+     question = question.lower()
+
+     # Check if the question asks about columns
+     if "columns" in question or "fields" in question:
+         table_name = extract_table_name(question)
+         if table_name:
+             if table_name in schema:
+                 return schema[table_name]["columns"]
+             else:
+                 return f"Table '{table_name}' not found in the schema."
+
+     # Check if the question asks about relations
+     elif "relations" in question or "relationships" in question:
+         table_name = extract_table_name(question)
+         if table_name:
+             if table_name in schema:
+                 return schema[table_name]["relations"]
+             else:
+                 return f"Table '{table_name}' not found in the schema."
+
+     # Additional cases can be handled here (e.g., "Which tables are in the schema?")
+     elif "tables" in question:
+         return list(schema.keys())
+
+     # If the question is too vague or doesn't match the expected patterns
+     return "Sorry, I couldn't understand your schema question. Could you rephrase?"
+
+
+ # Example schema for your custom use case
+ custom_schema = {
+     "products": {
+         "columns": ["product_id", "name", "price", "category_id"],
+         "relations": "category_id -> categories.id",
+     },
+     "categories": {
+         "columns": ["id", "category_name"],
+         "relations": None,
+     },
+     "orders": {
+         "columns": ["order_id", "user_id", "product_id", "order_date"],
+         "relations": ["product_id -> products.product_id", "user_id -> users.user_id"],
+     },
+     "users": {
+         "columns": ["user_id", "first_name", "last_name", "email", "phone_number", "address"],
+         "relations": None,
+     }
+ }
+
+ def answer_question(question: str, schema: dict, model, tokenizer, device):
+     # First, check if the question is about the schema or SQL
+     if is_schema_question(question):
+         # Handle schema-related questions
+         response = handle_schema_question(question, schema)
+         return f"Schema Information: {response}"
+     else:
+         # Generate an SQL query for data-related questions
+         sql_query = generate_sql(question, schema, model, tokenizer, device)
+         return f"Generated SQL Query: {sql_query}"
+
+ # Example input questions
+ question_1 = "What columns does the products table have?"
+ question_2 = "What is the price of the product with product_id 123?"
+
+ # Move the model to the same device the inputs are sent to
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ model = model.to(device)
+
+ # Handle schema question
+ # (note: extract_table_name only recognises 'for/in/from <word>', so this phrasing
+ # falls back to the generic "couldn't understand" message rather than listing columns)
+ response_1 = answer_question(question_1, custom_schema, model, tokenizer, device)
+ print(response_1)
+
+ # Handle SQL query question
+ response_2 = answer_question(question_2, custom_schema, model, tokenizer, device)
+ print(response_2)  # This should generate an SQL query for fetching the price
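The schema-routing helpers above can be checked without the fine-tuned checkpoint. A minimal sketch, assuming is_schema_question and extract_table_name are in scope (e.g., pasted into a REPL):

# Schema questions are routed away from the T5 model
print(is_schema_question("Which tables are in the schema?"))    # True
print(is_schema_question("What is the price of product 123?"))  # False

# Table-name extraction only recognises the 'for/in/from <word>' pattern
print(extract_table_name("What are the columns for products?"))              # products
print(extract_table_name("What columns does the products table have?"))      # None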
wikiSQL.py ADDED
@@ -0,0 +1,199 @@
+ import json
+ import os
+ import torch
+ from datasets import Dataset
+ from transformers import (
+     AutoTokenizer,
+     AutoModelForSeq2SeqLM,
+     Seq2SeqTrainer,
+     Seq2SeqTrainingArguments,
+ )
+ from torch.utils.data import DataLoader
+ from sklearn.model_selection import train_test_split
+ from tqdm import tqdm
+
+
+ def load_table_schemas(tables_file):
+     """
+     Load table schemas from the tables.jsonl file.
+
+     Args:
+         tables_file: Path to the tables.jsonl file.
+
+     Returns:
+         A dictionary mapping table IDs to their column names.
+     """
+     table_schemas = {}
+     with open(tables_file, 'r') as f:
+         for line in f:
+             table_data = json.loads(line)
+             table_id = table_data["id"]
+             table_columns = table_data["header"]
+             table_schemas[table_id] = table_columns
+     return table_schemas
+
+
+ # Step 1: Load and Preprocess WikiSQL Data
+ def load_wikisql(data_dir):
+     """
+     Load WikiSQL data and prepare it for training.
+     Args:
+         data_dir: Path to the WikiSQL dataset directory.
+     Returns:
+         List of examples with input and target text.
+     """
+     def parse_file(file_path):
+         with open(file_path, 'r') as f:
+             return [json.loads(line) for line in f]
+
+     tables_data = parse_file(os.path.join(data_dir, "train.tables.jsonl"))
+     train_data = parse_file(os.path.join(data_dir, "train.jsonl"))
+     dev_data = parse_file(os.path.join(data_dir, "dev.jsonl"))
+
+     print("====>", train_data[0])
+     tables_file = "./data/train.tables.jsonl"
+     table_schemas = load_table_schemas(tables_file)
+
+     dev_tables = './data/dev.tables.jsonl'
+     dev_tables_schema = load_table_schemas(dev_tables)
+
+     def format_data(data, type):
+         formatted = []
+         for item in data:
+             table_id = item["table_id"]
+             table_columns = table_schemas[table_id] if type == 'train' else dev_tables_schema[table_id]
+             question = item["question"]
+             sql = item["sql"]
+             sql_query = sql_to_text(sql, table_columns)
+             print("SQL Query", sql_query)
+             formatted.append({"input": f"Question: {question}", "target": sql_query})
+         return formatted
+
+     return format_data(train_data, "train"), format_data(dev_data, "dev")
+
+
+ def sql_to_text(sql, table_columns):
+     """
+     Convert SQL dictionary from WikiSQL to text representation.
+
+     Args:
+         sql: SQL dictionary from WikiSQL (e.g., {"sel": 5, "conds": [[3, 0, "value"]], "agg": 0}).
+         table_columns: List of column names corresponding to the table.
+
+     Returns:
+         SQL query as a string.
+     """
+     # Aggregation functions mapping
+     agg_functions = ["", "MAX", "MIN", "COUNT", "SUM", "AVG"]
+     operators = ["=", ">", "<"]
+
+     # Get selected column
+     sel_column = table_columns[sql["sel"]]
+     agg_func = agg_functions[sql["agg"]]
+     select_clause = f"SELECT {agg_func}({sel_column})" if agg_func else f"SELECT {sel_column}"
+
+     # Get conditions
+     if sql["conds"]:
+         conditions = []
+         for cond in sql["conds"]:
+             col_idx, operator, value = cond
+             col_name = table_columns[col_idx]
+             conditions.append(f"{col_name} {operators[operator]} '{value}'")
+         where_clause = " WHERE " + " AND ".join(conditions)
+     else:
+         where_clause = ""
+
+     # Combine clauses into a full query
+     return select_clause + where_clause
+
+ # Step 2: Tokenize the Data
+ def tokenize_data(data, tokenizer, max_length=128):
+     """
+     Tokenize the input and target text.
+     Args:
+         data: List of examples with "input" and "target".
+         tokenizer: Pretrained tokenizer.
+         max_length: Maximum sequence length for the model.
+     Returns:
+         Tokenized dataset.
+     """
+     inputs = [item["input"] for item in data]
+     targets = [item["target"] for item in data]
+
+     tokenized = tokenizer(
+         inputs,
+         max_length=max_length,
+         padding="max_length",
+         truncation=True,
+         return_tensors="pt",
+     )
+     labels = tokenizer(
+         targets,
+         max_length=max_length,
+         padding="max_length",
+         truncation=True,
+         return_tensors="pt",
+     )
+
+     tokenized["labels"] = labels["input_ids"]
+     return tokenized
+
+
+ # Step 3: Load Model and Tokenizer
+ model_name = "t5-small"  # Use "t5-small", "t5-base", or "t5-large"
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+
+ # Step 4: Prepare Training and Validation Data
+ data_dir = "data"  # Path to the WikiSQL dataset
+ train_data, dev_data = load_wikisql(data_dir)
+
+ # Tokenize Data
+ train_dataset = tokenize_data(train_data, tokenizer)
+ dev_dataset = tokenize_data(dev_data, tokenizer)
+
+ # Convert to Hugging Face Dataset format
+ train_dataset = Dataset.from_dict(train_dataset)
+ dev_dataset = Dataset.from_dict(dev_dataset)
+
+ # Step 5: Define Training Arguments
+ # training_args = Seq2SeqTrainingArguments(
+ #     output_dir="./t5_sql_finetuned",
+ #     evaluation_strategy="steps",
+ #     save_steps=1000,
+ #     eval_steps=100,
+ #     logging_steps=100,
+ #     per_device_train_batch_size=16,
+ #     per_device_eval_batch_size=16,
+ #     num_train_epochs=3,
+ #     save_total_limit=2,
+ #     learning_rate=5e-5,
+ #     predict_with_generate=True,
+ #     fp16=torch.cuda.is_available(),  # Enable mixed precision for faster training
+ #     logging_dir="./logs",
+ # )
+
+ # Step 6: Define Trainer
+ # trainer = Seq2SeqTrainer(
+ #     model=model,
+ #     args=training_args,
+ #     train_dataset=train_dataset,
+ #     eval_dataset=dev_dataset,
+ #     tokenizer=tokenizer,
+ # )
+
+ # Step 7: Train the Model
+ # trainer.train()
+
+ # Step 8: Save the Model
+ # trainer.save_model("./t5_sql_finetuned")
+ # tokenizer.save_pretrained("./t5_sql_finetuned")
+
+ # Step 9: Test the Model
+ test_question = "Find all orders with product_id greater than 5."
+ input_text = f"Question: {test_question}"
+ inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True)
+
+ outputs = model.generate(**inputs, max_length=128)
+ generated_sql = tokenizer.decode(outputs[0], skip_special_tokens=True)
+ print("Generated SQL:", generated_sql)