From 0999a4569911144dd7d52cf008caa79dcee5487e Mon Sep 17 00:00:00 2001
From: XXXJumpingFrogXXX
Date: Wed, 9 Oct 2024 15:03:11 +0800
Subject: [PATCH 1/2] This commit includes the following changes: (1) Created
 a new main.py file and completed some basic FastAPI settings in it.
 (2) Renamed the original main.py file to original_main.py. (3) Kept the
 existing piggy directory and created a chat directory to establish separate
 routers and APIs for each project.

---
 chat/router.py   | 111 ++++++++++++++++++++++++++++++++++++++++++++++++
 main.py          |  49 ++++++++++-----------
 original_main.py |  27 ++++++++++++
 3 files changed, 160 insertions(+), 27 deletions(-)
 create mode 100644 chat/router.py
 create mode 100644 original_main.py

diff --git a/chat/router.py b/chat/router.py
new file mode 100644
index 0000000..e2f001e
--- /dev/null
+++ b/chat/router.py
@@ -0,0 +1,111 @@
+from fastapi import APIRouter
+from pydantic import BaseModel
+
+import re
+from unsloth import FastLanguageModel
+import torch
+
+max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
+dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
+load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
+
+alpaca_prompt = """Below is an instruction that describes a task, along with an input that provides additional context. Write a response that appropriately completes the request.
+
+### Instruction:
+{}
+
+### Input:
+{}
+
+### Response:
+{}"""
+
+class Question(BaseModel):
+    query: str
+
+@router.post("/generate_answer")
+def generate_answer(value: Question):
+    try:
+        llama_model, llama_tokenizer = FastLanguageModel.from_pretrained(
+            model_name = "Antonio27/llama3-8b-4-bit-for-sugar",
+            max_seq_length = max_seq_length,
+            dtype = dtype,
+            load_in_4bit = load_in_4bit,
+        )
+
+        gemma_model, gemma_tokenizer = FastLanguageModel.from_pretrained(
+            model_name = "unsloth/gemma-2-9b-it-bnb-4bit",
+            max_seq_length = max_seq_length,
+            dtype = dtype,
+            load_in_4bit = load_in_4bit,
+        )
+
+        FastLanguageModel.for_inference(llama_model)
+        llama_tokenizer.pad_token = llama_tokenizer.eos_token
+        llama_tokenizer.add_eos_token = True
+
+        inputs = llama_tokenizer(
+        [
+            alpaca_prompt.format(
+                f'''
+                Your task is to answer children's questions using simple language.
+                Explain any difficult words in a way a 3-year-old can understand.
+                Keep responses under 60 words.
+                \n\nQuestion: {value.query}
+                ''', # instruction
+                "", # input
+                "", # output - leave this blank for generation!
+            )
+        ], return_tensors="pt").to("cuda")
+
+        outputs = llama_model.generate(**inputs, max_new_tokens=256, temperature=0.6)
+        decoded_outputs = llama_tokenizer.batch_decode(outputs)
+
+        response_text = decoded_outputs[0]
+
+        match = re.search(r"### Response:(.*?)(?=\n###|$)", response_text, re.DOTALL)
+        if match:
+            initial_response = match.group(1).strip()
+        else:
+            initial_response = ""
+
+        FastLanguageModel.for_inference(gemma_model)
+        gemma_tokenizer.pad_token = gemma_tokenizer.eos_token
+        gemma_tokenizer.add_eos_token = True
+
+        inputs = gemma_tokenizer(
+        [
+            alpaca_prompt.format(
+                f'''
+                Modify the given content for a 5-year-old.
+                Use simple words and phrases.
+                Remove any repetitive information.
+                Keep responses under 50 words.
+                \n\nGiven Content: {initial_response}
+                ''', # instruction
+                "", # input
+                "", # output - leave this blank for generation!
+            )
+        ], return_tensors="pt").to("cuda")
+
+        outputs = gemma_model.generate(**inputs, max_new_tokens=256, temperature=0.6)
+        decoded_outputs = gemma_tokenizer.batch_decode(outputs)
+
+        response_text = decoded_outputs[0]
+
+        match = re.search(r"### Response:(.*?)(?=\n###|$)", response_text, re.DOTALL)
+        if match:
+            adjusted_response = match.group(1).strip()
+        else:
+            adjusted_response = ""
+
+        return {
+            'success': True,
+            'response': {
+                "result": adjusted_response
+            }
+        }
+
+    except Exception as e:
+        return {'success': False, 'response': str(e)}
+    
\ No newline at end of file
diff --git a/main.py b/main.py
index 35f0a27..d1e736e 100644
--- a/main.py
+++ b/main.py
@@ -1,27 +1,22 @@
-
-from transformers import GPT2Tokenizer, GPT2LMHeadModel
-
-
-# We should rename this
-class AI_Test:
-    def __init__(self):
-        pass
-
-    def generate_bot_response(self, question):
-        tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
-        model = GPT2LMHeadModel.from_pretrained("distilgpt2")
-
-        prompt = '''
-        Your task is to answer children's questions using simple language.
-        Explain any difficult words in a way a 3-year-old can understand.
-        Keep responses under 60 words.
-        \n\nQuestion:
-        '''
-
-        input_text = prompt + question
-
-        inputs = tokenizer.encode(input_text, return_tensors='pt')
-        outputs = model.generate(inputs, max_length=150, num_return_sequences=1)
-        answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-        return answer
+import os
+import uvicorn
+from fastapi import FastAPI
+from fastapi.middleware.cors import CORSMiddleware
+
+from chat.router import router as chat_router
+# from piggy.router import router as piggy_router
+
+app = FastAPI(
+    docs_url="/sugar-ai/docs",
+)
+
+app.include_router(chat_router, prefix="/sugar-ai/chat")
+# app.include_router(piggy_router, prefix="/sugar-ai/piggy")
+
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
\ No newline at end of file
diff --git a/original_main.py b/original_main.py
new file mode 100644
index 0000000..35f0a27
--- /dev/null
+++ b/original_main.py
@@ -0,0 +1,27 @@
+
+from transformers import GPT2Tokenizer, GPT2LMHeadModel
+
+
+# We should rename this
+class AI_Test:
+    def __init__(self):
+        pass
+
+    def generate_bot_response(self, question):
+        tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
+        model = GPT2LMHeadModel.from_pretrained("distilgpt2")
+
+        prompt = '''
+        Your task is to answer children's questions using simple language.
+        Explain any difficult words in a way a 3-year-old can understand.
+        Keep responses under 60 words.
+        \n\nQuestion:
+        '''
+
+        input_text = prompt + question
+
+        inputs = tokenizer.encode(input_text, return_tensors='pt')
+        outputs = model.generate(inputs, max_length=150, num_return_sequences=1)
+        answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+        return answer

From c24a43fc1b4db262330bfe67fa144bbe4cfa462f Mon Sep 17 00:00:00 2001
From: XXXJumpingFrogXXX
Date: Wed, 16 Oct 2024 15:46:50 +0800
Subject: [PATCH 2/2] This commit includes the following changes: (1) Created
 a new main.py file: Established basic FastAPI settings to enhance
 application structure and scalability. This includes initial configuration
 and middleware setup, laying the groundwork for future development.
 (2) Renamed the original main.py: Changed to original_main.py to preserve
 the previous version and provide a reference for legacy code, facilitating a
 smooth transition and ensuring no loss of important historical context.
 (3) Refactored project structure: Maintained the existing piggy directory
 and introduced a chat directory. This separation of routers and APIs
 improves modularity, making it easier to manage and extend each project
 independently. These changes aim to improve code organization and prepare
 the project for scalable development with FastAPI.

---
 chat/router.py | 31 +++++++++++++++++++++++--------
 main.py        | 12 ++++++++----
 2 files changed, 31 insertions(+), 12 deletions(-)

diff --git a/chat/router.py b/chat/router.py
index e2f001e..ab00cb9 100644
--- a/chat/router.py
+++ b/chat/router.py
@@ -23,27 +23,33 @@
 class Question(BaseModel):
     query: str
 
+router = APIRouter()
+
 @router.post("/generate_answer")
 def generate_answer(value: Question):
     try:
+        # Load the llama model and tokenizer from the pretrained model
         llama_model, llama_tokenizer = FastLanguageModel.from_pretrained(
-            model_name = "Antonio27/llama3-8b-4-bit-for-sugar",
-            max_seq_length = max_seq_length,
-            dtype = dtype,
-            load_in_4bit = load_in_4bit,
+            model_name="Antonio27/llama3-8b-4-bit-for-sugar",
+            max_seq_length=max_seq_length,
+            dtype=dtype,
+            load_in_4bit=load_in_4bit,
         )
 
+        # Load the gemma model and tokenizer from the pretrained model
         gemma_model, gemma_tokenizer = FastLanguageModel.from_pretrained(
-            model_name = "unsloth/gemma-2-9b-it-bnb-4bit",
-            max_seq_length = max_seq_length,
-            dtype = dtype,
-            load_in_4bit = load_in_4bit,
+            model_name="unsloth/gemma-2-9b-it-bnb-4bit",
+            max_seq_length=max_seq_length,
+            dtype=dtype,
+            load_in_4bit=load_in_4bit,
         )
 
+        # Prepare llama model for inference
         FastLanguageModel.for_inference(llama_model)
         llama_tokenizer.pad_token = llama_tokenizer.eos_token
         llama_tokenizer.add_eos_token = True
 
+        # Tokenize the input question for the llama model
         inputs = llama_tokenizer(
         [
             alpaca_prompt.format(
@@ -58,21 +64,26 @@ def generate_answer(value: Question):
             )
         ], return_tensors="pt").to("cuda")
 
+        # Generate output using the llama model
         outputs = llama_model.generate(**inputs, max_new_tokens=256, temperature=0.6)
         decoded_outputs = llama_tokenizer.batch_decode(outputs)
 
+        # Extract the response text
         response_text = decoded_outputs[0]
 
+        # Use regex to find the response section in the output
         match = re.search(r"### Response:(.*?)(?=\n###|$)", response_text, re.DOTALL)
         if match:
             initial_response = match.group(1).strip()
         else:
             initial_response = ""
 
+        # Prepare gemma model for inference
         FastLanguageModel.for_inference(gemma_model)
         gemma_tokenizer.pad_token = gemma_tokenizer.eos_token
         gemma_tokenizer.add_eos_token = True
 
+        # Tokenize the initial response for the gemma model
         inputs = gemma_tokenizer(
         [
             alpaca_prompt.format(
@@ -88,17 +99,21 @@ def generate_answer(value: Question):
             )
         ], return_tensors="pt").to("cuda")
 
+        # Generate adjusted output using the gemma model
         outputs = gemma_model.generate(**inputs, max_new_tokens=256, temperature=0.6)
         decoded_outputs = gemma_tokenizer.batch_decode(outputs)
 
+        # Extract the adjusted response text
         response_text = decoded_outputs[0]
 
+        # Use regex to find the response section in the output
         match = re.search(r"### Response:(.*?)(?=\n###|$)", response_text, re.DOTALL)
         if match:
             adjusted_response = match.group(1).strip()
         else:
             adjusted_response = ""
 
+        # Return the final adjusted response in a success dictionary
         return {
             'success': True,
             'response': {
diff --git a/main.py b/main.py
index d1e736e..a685474 100644
--- a/main.py
+++ b/main.py
@@ -6,17 +6,21 @@
 from chat.router import router as chat_router
 # from piggy.router import router as piggy_router
 
+# Create a FastAPI application instance with custom documentation URL
 app = FastAPI(
     docs_url="/sugar-ai/docs",
 )
 
+# Include the chat router with a specified prefix for endpoint paths
 app.include_router(chat_router, prefix="/sugar-ai/chat")
+# Include the piggy router with a specified prefix for endpoint paths (currently commented out)
 # app.include_router(piggy_router, prefix="/sugar-ai/piggy")
 
+# Add CORS middleware to allow cross-origin requests from any origin
 app.add_middleware(
     CORSMiddleware,
-    allow_origins=["*"],
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
+    allow_origins=["*"], # Allow requests from any origin
+    allow_credentials=True, # Allow sending of credentials (e.g., cookies)
+    allow_methods=["*"], # Allow all HTTP methods
+    allow_headers=["*"], # Allow all headers
 )
\ No newline at end of file
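
A minimal sketch of how the new endpoint might be exercised once both patches are applied. Note that main.py imports uvicorn but never starts a server, so the launch command, host, port, and the use of the requests library below are assumptions rather than anything the patches define; only the /sugar-ai/chat/generate_answer path, the query field, and the response shape come from the code above.

    # Sketch only: assumed launch command (uvicorn is imported in main.py but never invoked there):
    #   uvicorn main:app --host 0.0.0.0 --port 8000
    import requests  # assumed HTTP client; not part of the patches

    resp = requests.post(
        "http://localhost:8000/sugar-ai/chat/generate_answer",  # router prefix from main.py + route from chat/router.py
        json={"query": "Why is the sky blue?"},                 # matches the Question model's `query` field
        timeout=600,  # both models are loaded inside the handler, so the first call can be very slow
    )
    payload = resp.json()
    if payload.get("success"):
        print(payload["response"]["result"])      # answer after the gemma rewriting pass
    else:
        print("error:", payload.get("response"))  # the handler returns the exception text on failure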