from openai import OpenAI
class NVIDIAInferenceService:
    """Thin wrapper around NVIDIA's NIM OpenAI-compatible chat-completion API."""

    def __init__(self, api_key: str, model_name: str = "deepseek-ai/deepseek-v3.1") -> None:
        """Create a client bound to NVIDIA's OpenAI-compatible endpoint.

        Args:
            api_key: NVIDIA API key used to authenticate requests.
            model_name: Model identifier passed to chat completions.
        """
        self.client = OpenAI(
            base_url="https://integrate.api.nvidia.com/v1",
            api_key=api_key,
        )
        self.model_name = model_name

    def generate_response(self, prompt: str) -> str:
        """Stream a chat completion for *prompt* and return the accumulated text.

        Reasoning ("thinking") tokens, when the model emits them, are
        concatenated into the result ahead of the regular content tokens,
        matching the order they arrive on the stream.

        Args:
            prompt: The user message to send to the model.

        Returns:
            The full reasoning + content text of the streamed response.
        """
        completion = self.client.chat.completions.create(
            model=self.model_name,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.2,
            top_p=0.7,
            max_tokens=8192,
            # NIM-specific flag asking the model to emit reasoning tokens.
            extra_body={"chat_template_kwargs": {"thinking": True}},
            stream=True,
        )
        # Collect pieces in a list and join once — avoids quadratic string
        # concatenation on long streams.
        parts: list[str] = []
        for chunk in completion:
            # Some stream events (e.g. usage-only chunks) carry an empty
            # `choices` list; indexing them blindly raises IndexError.
            if not chunk.choices:
                continue
            delta = chunk.choices[0].delta
            reasoning = getattr(delta, "reasoning_content", None)
            if reasoning:
                parts.append(reasoning)
            content = getattr(delta, "content", None)
            if content is not None:
                parts.append(content)
        return "".join(parts)