from fastapi import FastAPI
from fastapi.responses import HTMLResponse
from pydantic import BaseModel
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, AutoModelForSeq2SeqLM
import uvicorn
import torch

app = FastAPI()

# Generation model and tokenizer (causal LM); the pipeline reuses them so the
# weights are only loaded once
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Load summarization model and tokenizer (T5-based encoder-decoder)
sum_tokenizer = AutoTokenizer.from_pretrained("Falconsai/text_summarization")
sum_model = AutoModelForSeq2SeqLM.from_pretrained("Falconsai/text_summarization")


class GenRequest(BaseModel):
    text: str
    max_new_tokens: int = 150
    do_sample: bool = False  # set True if you want to use temperature/top_p, etc.


@app.post("/generate")
def generate(req: GenRequest):
    out = pipe(
        req.text,
        max_new_tokens=req.max_new_tokens,
        do_sample=req.do_sample,
        truncation=True,
        return_full_text=False,
    )
    return {"generated_text": out[0]["generated_text"]}


@app.post("/summarize")
def summarize(req: GenRequest):
    inputs = sum_tokenizer(req.text, return_tensors="pt", truncation=True, max_length=512)
    # Generate summary tokens (T5 is encoder-decoder, so only generated tokens are returned)
    with torch.no_grad():
        generated_ids = sum_model.generate(
            **inputs,
            max_new_tokens=req.max_new_tokens,
            do_sample=req.do_sample,
            output_scores=True,
            return_dict_in_generate=True,
            pad_token_id=sum_tokenizer.pad_token_id,
        )
    # sequences[0] starts with the decoder start token, so scores[i]
    # corresponds to sequences[0][i + 1]
    all_token_ids = generated_ids.sequences[0]
    output = []
    # Process generated tokens (T5 doesn't output input tokens in generation)
    if hasattr(generated_ids, "scores") and generated_ids.scores:
        for i, scores in enumerate(generated_ids.scores):
            token_id = all_token_ids[i + 1]
            logit_scores = scores[0]
            # Top-5 candidate tokens at this decoding step
            _, top_indices = torch.topk(logit_scores, k=5, dim=-1)
            top_tokens = sum_tokenizer.convert_ids_to_tokens(top_indices.tolist())
            decoded = sum_tokenizer.decode([token_id])
            output.append((decoded, top_tokens))
    return {"output": output}


@app.post("/generate_tokens")
def gen_tokens(req: GenRequest):
    inputs = tokenizer(req.text, return_tensors="pt")
    input_length = inputs["input_ids"].shape[1]

    # Generate new tokens, keeping the per-step logits
    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=req.max_new_tokens,
            do_sample=req.do_sample,
            output_scores=True,
            return_dict_in_generate=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # All token ids (input + generated)
    all_token_ids = generated_ids.sequences[0]
    output = []

    # Process input tokens: a single forward pass gives, at each position i,
    # the model's prediction for position i + 1, i.e. the "next predicted tokens"
    with torch.no_grad():
        input_output = model(**inputs)
    input_logits = input_output.logits
    for i, token_id in enumerate(inputs["input_ids"][0]):
        logit_scores = input_logits[0, i, :]
        _, top_indices = torch.topk(logit_scores, k=5, dim=-1)
        top_tokens = tokenizer.convert_ids_to_tokens(top_indices.tolist())
        decoded = tokenizer.decode([token_id])
        output.append((decoded, top_tokens))

    # Process generated tokens: scores[i] is the distribution the i-th new
    # token was chosen from
    if hasattr(generated_ids, "scores") and generated_ids.scores:
        for i, scores in enumerate(generated_ids.scores):
            token_id = all_token_ids[input_length + i]
            logit_scores = scores[0]
            _, top_indices = torch.topk(logit_scores, k=5, dim=-1)
            top_tokens = tokenizer.convert_ids_to_tokens(top_indices.tolist())
            decoded = tokenizer.decode([token_id])
            output.append((decoded, top_tokens))

    return {"output": output}


@app.get("/", response_class=HTMLResponse)
def index():
    # Minimal front-end: a textarea, the Generate/Summarize buttons, and panes
    # for the result and the token-level predictions, wired to the JSON
    # endpoints above
    html_content = """
<!DOCTYPE html>
<html>
<head>
    <title>Text Generation &amp; Summarization</title>
</head>
<body>
    <h1>🤖 Text Generation &amp; Summarization</h1>
    <textarea id="text" rows="6" cols="80" placeholder="Enter text..."></textarea>
    <br>
    <button onclick="run('/generate')">Generate</button>
    <button onclick="run('/summarize')">Summarize</button>
    <h3>Result:</h3>
    <pre id="result"></pre>
    <h3>Next Predicted Tokens:</h3>
    <pre id="tokens"></pre>
    <script>
    async function call(path) {
        const res = await fetch(path, {
            method: "POST",
            headers: {"Content-Type": "application/json"},
            body: JSON.stringify({text: document.getElementById("text").value}),
        });
        return res.json();
    }
    async function run(path) {
        // /generate returns {generated_text}; /summarize returns {output: [token, top-5] pairs}
        const data = await call(path);
        document.getElementById("result").textContent =
            data.generated_text ?? JSON.stringify(data.output, null, 2);
        // Token-level view: each token of the prompt and continuation with its top-5 candidates
        const tokens = await call("/generate_tokens");
        document.getElementById("tokens").textContent =
            tokens.output.map(([tok, top5]) => tok + "  ->  " + top5.join(" | ")).join("\\n");
    }
    </script>
</body>
</html>
    """
    return html_content


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)