From 0999a4569911144dd7d52cf008caa79dcee5487e Mon Sep 17 00:00:00 2001
From: XXXJumpingFrogXXX
Date: Wed, 9 Oct 2024 15:03:11 +0800
Subject: [PATCH 1/2] This commit includes the following changes: (1) Created
 a new main.py file and completed some basic FastAPI settings in it.
 (2) Renamed the original main.py file to original_main.py. (3) Kept the
 existing piggy directory and created a chat directory to establish separate
 routers and APIs for each project.

---
 chat/router.py   | 111 ++++++++++++++++++++++++++++++++++++++++++++++++
 main.py          |  49 ++++++++++-----------
 original_main.py |  27 ++++++++++++
 3 files changed, 160 insertions(+), 27 deletions(-)
 create mode 100644 chat/router.py
 create mode 100644 original_main.py

diff --git a/chat/router.py b/chat/router.py
new file mode 100644
index 0000000..e2f001e
--- /dev/null
+++ b/chat/router.py
@@ -0,0 +1,111 @@
+from fastapi import APIRouter
+from pydantic import BaseModel
+
+import re
+from unsloth import FastLanguageModel
+import torch
+
+max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
+dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
+load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
+
+alpaca_prompt = """Below is an instruction that describes a task, along with an input that provides additional context. Write a response that appropriately completes the request.
+
+### Instruction:
+{}
+
+### Input:
+{}
+
+### Response:
+{}"""
+
+class Question(BaseModel):
+    query: str
+
+@router.post("/generate_answer")
+def generate_answer(value: Question):
+    try:
+        llama_model, llama_tokenizer = FastLanguageModel.from_pretrained(
+            model_name = "Antonio27/llama3-8b-4-bit-for-sugar",
+            max_seq_length = max_seq_length,
+            dtype = dtype,
+            load_in_4bit = load_in_4bit,
+        )
+
+        gemma_model, gemma_tokenizer = FastLanguageModel.from_pretrained(
+            model_name = "unsloth/gemma-2-9b-it-bnb-4bit",
+            max_seq_length = max_seq_length,
+            dtype = dtype,
+            load_in_4bit = load_in_4bit,
+        )
+
+        FastLanguageModel.for_inference(llama_model)
+        llama_tokenizer.pad_token = llama_tokenizer.eos_token
+        llama_tokenizer.add_eos_token = True
+
+        inputs = llama_tokenizer(
+        [
+            alpaca_prompt.format(
+                f'''
+                Your task is to answer children's questions using simple language.
+                Explain any difficult words in a way a 3-year-old can understand.
+                Keep responses under 60 words.
+                \n\nQuestion: {value.query}
+                ''', # instruction
+                "", # input
+                "", # output - leave this blank for generation!
+            )
+        ], return_tensors="pt").to("cuda")
+
+        outputs = llama_model.generate(**inputs, max_new_tokens=256, temperature=0.6)
+        decoded_outputs = llama_tokenizer.batch_decode(outputs)
+
+        response_text = decoded_outputs[0]
+
+        match = re.search(r"### Response:(.*?)(?=\n###|$)", response_text, re.DOTALL)
+        if match:
+            initial_response = match.group(1).strip()
+        else:
+            initial_response = ""
+
+        FastLanguageModel.for_inference(gemma_model)
+        gemma_tokenizer.pad_token = gemma_tokenizer.eos_token
+        gemma_tokenizer.add_eos_token = True
+
+        inputs = gemma_tokenizer(
+        [
+            alpaca_prompt.format(
+                f'''
+                Modify the given content for a 5-year-old.
+                Use simple words and phrases.
+                Remove any repetitive information.
+                Keep responses under 50 words.
+                \n\nGiven Content: {initial_response}
+                ''', # instruction
+                "", # input
+                "", # output - leave this blank for generation!
+            )
+        ], return_tensors="pt").to("cuda")
+
+        outputs = gemma_model.generate(**inputs, max_new_tokens=256, temperature=0.6)
+        decoded_outputs = gemma_tokenizer.batch_decode(outputs)
+
+        response_text = decoded_outputs[0]
+
+        match = re.search(r"### Response:(.*?)(?=\n###|$)", response_text, re.DOTALL)
+        if match:
+            adjusted_response = match.group(1).strip()
+        else:
+            adjusted_response = ""
+
+        return {
+            'success': True,
+            'response': {
+                "result": adjusted_response
+            }
+        }
+
+    except Exception as e:
+        return {'success': False, 'response': str(e)}
+    
\ No newline at end of file
diff --git a/main.py b/main.py
index 35f0a27..d1e736e 100644
--- a/main.py
+++ b/main.py
@@ -1,27 +1,22 @@
-
-from transformers import GPT2Tokenizer, GPT2LMHeadModel
-
-
-# We should rename this
-class AI_Test:
-    def __init__(self):
-        pass
-
-    def generate_bot_response(self, question):
-        tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
-        model = GPT2LMHeadModel.from_pretrained("distilgpt2")
-
-        prompt = '''
-        Your task is to answer children's questions using simple language.
-        Explain any difficult words in a way a 3-year-old can understand.
-        Keep responses under 60 words.
-        \n\nQuestion:
-        '''
-
-        input_text = prompt + question
-
-        inputs = tokenizer.encode(input_text, return_tensors='pt')
-        outputs = model.generate(inputs, max_length=150, num_return_sequences=1)
-        answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-        return answer
+import os
+import uvicorn
+from fastapi import FastAPI
+from fastapi.middleware.cors import CORSMiddleware
+
+from chat.router import router as chat_router
+# from piggy.router import router as piggy_router
+
+app = FastAPI(
+    docs_url="/sugar-ai/docs",
+)
+
+app.include_router(chat_router, prefix="/sugar-ai/chat")
+# app.include_router(piggy_router, prefix="/sugar-ai/piggy")
+
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
\ No newline at end of file
diff --git a/original_main.py b/original_main.py
new file mode 100644
index 0000000..35f0a27
--- /dev/null
+++ b/original_main.py
@@ -0,0 +1,27 @@
+
+from transformers import GPT2Tokenizer, GPT2LMHeadModel
+
+
+# We should rename this
+class AI_Test:
+    def __init__(self):
+        pass
+
+    def generate_bot_response(self, question):
+        tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
+        model = GPT2LMHeadModel.from_pretrained("distilgpt2")
+
+        prompt = '''
+        Your task is to answer children's questions using simple language.
+        Explain any difficult words in a way a 3-year-old can understand.
+        Keep responses under 60 words.
+        \n\nQuestion:
+        '''
+
+        input_text = prompt + question
+
+        inputs = tokenizer.encode(input_text, return_tensors='pt')
+        outputs = model.generate(inputs, max_length=150, num_return_sequences=1)
+        answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+        return answer

From c24a43fc1b4db262330bfe67fa144bbe4cfa462f Mon Sep 17 00:00:00 2001
From: XXXJumpingFrogXXX
Date: Wed, 16 Oct 2024 15:46:50 +0800
Subject: [PATCH 2/2] This commit includes the following changes: (1) Created
 a new main.py file: Established basic FastAPI settings to enhance
 application structure and scalability. This includes initial configuration
 and middleware setup, laying the groundwork for future development.
 (2) Renamed the original main.py: Changed to original_main.py to preserve
 the previous version and provide a reference for legacy code, facilitating a
 smooth transition and ensuring no loss of important historical context.
 (3) Refactored project structure: Maintained the existing piggy directory
 and introduced a chat directory. This separation of routers and APIs
 improves modularity, making it easier to manage and extend each project
 independently. These changes aim to improve code organization and prepare
 the project for scalable development with FastAPI.

---
 chat/router.py | 31 +++++++++++++++++++++++--------
 main.py        | 12 ++++++++----
 2 files changed, 31 insertions(+), 12 deletions(-)

diff --git a/chat/router.py b/chat/router.py
index e2f001e..ab00cb9 100644
--- a/chat/router.py
+++ b/chat/router.py
@@ -23,27 +23,33 @@
 class Question(BaseModel):
     query: str
 
+router = APIRouter()
+
 @router.post("/generate_answer")
 def generate_answer(value: Question):
     try:
+        # Load the llama model and tokenizer from the pretrained model
         llama_model, llama_tokenizer = FastLanguageModel.from_pretrained(
-            model_name = "Antonio27/llama3-8b-4-bit-for-sugar",
-            max_seq_length = max_seq_length,
-            dtype = dtype,
-            load_in_4bit = load_in_4bit,
+            model_name="Antonio27/llama3-8b-4-bit-for-sugar",
+            max_seq_length=max_seq_length,
+            dtype=dtype,
+            load_in_4bit=load_in_4bit,
         )
 
+        # Load the gemma model and tokenizer from the pretrained model
         gemma_model, gemma_tokenizer = FastLanguageModel.from_pretrained(
-            model_name = "unsloth/gemma-2-9b-it-bnb-4bit",
-            max_seq_length = max_seq_length,
-            dtype = dtype,
-            load_in_4bit = load_in_4bit,
+            model_name="unsloth/gemma-2-9b-it-bnb-4bit",
+            max_seq_length=max_seq_length,
+            dtype=dtype,
+            load_in_4bit=load_in_4bit,
         )
 
+        # Prepare llama model for inference
         FastLanguageModel.for_inference(llama_model)
         llama_tokenizer.pad_token = llama_tokenizer.eos_token
         llama_tokenizer.add_eos_token = True
 
+        # Tokenize the input question for the llama model
         inputs = llama_tokenizer(
         [
             alpaca_prompt.format(
@@ -58,21 +64,26 @@ def generate_answer(value: Question):
             )
         ], return_tensors="pt").to("cuda")
 
+        # Generate output using the llama model
         outputs = llama_model.generate(**inputs, max_new_tokens=256, temperature=0.6)
         decoded_outputs = llama_tokenizer.batch_decode(outputs)
 
+        # Extract the response text
         response_text = decoded_outputs[0]
 
+        # Use regex to find the response section in the output
         match = re.search(r"### Response:(.*?)(?=\n###|$)", response_text, re.DOTALL)
         if match:
             initial_response = match.group(1).strip()
         else:
             initial_response = ""
 
+        # Prepare gemma model for inference
         FastLanguageModel.for_inference(gemma_model)
         gemma_tokenizer.pad_token = gemma_tokenizer.eos_token
         gemma_tokenizer.add_eos_token = True
 
+        # Tokenize the initial response for the gemma model
         inputs = gemma_tokenizer(
         [
             alpaca_prompt.format(
@@ -88,17 +99,21 @@ def generate_answer(value: Question):
             )
         ], return_tensors="pt").to("cuda")
 
+        # Generate adjusted output using the gemma model
         outputs = gemma_model.generate(**inputs, max_new_tokens=256, temperature=0.6)
         decoded_outputs = gemma_tokenizer.batch_decode(outputs)
 
+        # Extract the adjusted response text
         response_text = decoded_outputs[0]
 
+        # Use regex to find the response section in the output
         match = re.search(r"### Response:(.*?)(?=\n###|$)", response_text, re.DOTALL)
         if match:
             adjusted_response = match.group(1).strip()
         else:
             adjusted_response = ""
 
+        # Return the final adjusted response in a success dictionary
         return {
             'success': True,
             'response': {
diff --git a/main.py b/main.py
index d1e736e..a685474 100644
--- a/main.py
+++ b/main.py
@@ -6,17 +6,21 @@
 from chat.router import router as chat_router
 # from piggy.router import router as piggy_router
 
+# Create a FastAPI application instance with custom documentation URL
 app = FastAPI(
     docs_url="/sugar-ai/docs",
 )
 
+# Include the chat router with a specified prefix for endpoint paths
 app.include_router(chat_router, prefix="/sugar-ai/chat")
+# Include the piggy router with a specified prefix for endpoint paths (currently commented out)
 # app.include_router(piggy_router, prefix="/sugar-ai/piggy")
 
+# Add CORS middleware to allow cross-origin requests from any origin
 app.add_middleware(
     CORSMiddleware,
-    allow_origins=["*"],
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
+    allow_origins=["*"], # Allow requests from any origin
+    allow_credentials=True, # Allow sending of credentials (e.g., cookies)
+    allow_methods=["*"], # Allow all HTTP methods
+    allow_headers=["*"], # Allow all headers
 )
\ No newline at end of file
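
A minimal sketch of how the new endpoint might be exercised once both patches are applied. Note that main.py imports uvicorn but never starts a server, so the launch command, host, port, and the use of the requests library below are assumptions rather than anything the patches define; only the /sugar-ai/chat/generate_answer path, the query field, and the response shape come from the code above.

    # Sketch only: assumed launch command (uvicorn is imported in main.py but never invoked there):
    #   uvicorn main:app --host 0.0.0.0 --port 8000
    import requests  # assumed HTTP client; not part of the patches

    resp = requests.post(
        "http://localhost:8000/sugar-ai/chat/generate_answer",  # router prefix from main.py + route from chat/router.py
        json={"query": "Why is the sky blue?"},                 # matches the Question model's `query` field
        timeout=600,  # both models are loaded inside the handler, so the first call can be very slow
    )
    payload = resp.json()
    if payload.get("success"):
        print(payload["response"]["result"])      # answer after the gemma rewriting pass
    else:
        print("error:", payload.get("response"))  # the handler returns the exception text on failure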