I'm studying how to serve machine learning modules through FastAPI. I built a FastAPI app that answers questions using a pretrained machine learning model. With a single user this is not a problem, but when multiple users use it at the same time, responses become very slow. So when multiple users submit questions, is there a way to load the model once and share it across requests, instead of loading a fresh copy each time?
```python
import argparse
import time
from typing import Optional

import numpy as np
import torch
import uvicorn
from fastapi import FastAPI
from pydantic import BaseModel
from sentence_transformers import util

app = FastAPI()
# model_path, corpus, and corpusid are defined elsewhere.

class sentencebert_ai():
    def __init__(self) -> None:
        super().__init__()

    def ask_query(self, query, topN):
        startt = time.time()
        ask_result = []
        score = []
        result_value = []
        # The model is reloaded from disk on every call, which is the bottleneck.
        embedder = torch.load(model_path)
        corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)
        query_embedding = embedder.encode(query, convert_to_tensor=True)
        # Cosine similarity scores against the 121 corpus entries: torch.Size([121]).
        cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
        cos_scores = cos_scores.cpu()
        top_results = np.argpartition(-cos_scores, range(topN))[0:topN]
        for idx in top_results[0:topN]:
            # .item() pulls the plain number out of a 0-dim tensor such as tensor(5).
            ask_result.append(corpusid[idx].item())
            score.append(round(cos_scores[idx].item(), 3))
        # Build a JSON array to send back in the response.
        for i, e in zip(ask_result, score):
            result_value.append({"pred_id": i, "pred_weight": e})
        endd = time.time()
        print('time check', endd - startt)
        return result_value
        # return ','.join(str(e) for e in ask_result), ','.join(str(e) for e in score)


class Item_inference(BaseModel):
    text: str
    topN: Optional[int] = 1


@app.post("/retrieval", tags=["knowledge recommendation"])
async def Knowledge_recommendation(item: Item_inference):
    # db.append(item.dict())
    item.dict()
    results = _ai.ask_query(item.text, item.topN)
    return results


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--port", default='9003', type=int)
    # parser.add_argument("--mode", default='cpu', type=str, help='cpu for CPU mode, gpu for GPU mode')
    args = parser.parse_args()
    _ai = sentencebert_ai()
    uvicorn.run(app, host="0.0.0.0", port=args.port, workers=4)
```
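Note that `uvicorn.run(app, ..., workers=4)` does not actually start four workers: uvicorn only honors `workers` (or `reload`) when the application is passed as an import string rather than an object. A minimal sketch, assuming the file above is saved as `main.py`:

```python
import uvicorn

if __name__ == "__main__":
    # workers > 1 only takes effect with an import string, not an app object.
    uvicorn.run("main:app", host="0.0.0.0", port=9003, workers=4)
```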
Corrected version:
@app.post("/aaa") def your_endpoint(request: Request, item:Item_inference): start = time.time() model = request.app.state.model item.dict() #커널 실행시 필요 _ai = sentencebert_ai() results = _ai.ask_query(item.text, item.topN,model) end = time.time() print(end-start) return results ```
Comments:

- Since this is probably CPU bound (depending on how `ask_query` is implemented), you might want to instead start multiple instances (worker threads/processes) of your application when using gunicorn or similar, so that you can use more processor cores efficiently. – Astylar
- You can do that with uvicorn, no need to use gunicorn. – Pushbike
- Right, uvicorn also supports workers. – Astylar
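Related to these comments: the original endpoint is declared `async def` but calls blocking, CPU-bound code directly, so one slow query stalls the event loop for everyone. One alternative sketch: declare the endpoint as a plain `def`, which FastAPI then runs in its internal threadpool (the GIL still limits CPU parallelism, so multiple worker processes, e.g. `gunicorn -w 4 -k uvicorn.workers.UvicornWorker main:app`, remain the more thorough fix):

```python
@app.post("/retrieval", tags=["knowledge recommendation"])
def knowledge_recommendation(item: Item_inference):
    # A plain `def` endpoint runs in FastAPI's threadpool, so a slow,
    # CPU-bound query no longer blocks the event loop for other clients.
    return _ai.ask_query(item.text, item.topN)
```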