
Commit

Merge branch 'main' into huiling-lvm
BaoHuiling authored Aug 15, 2024
2 parents 0ca8a2e + 5d9a855 commit 5888b41
Showing 65 changed files with 917 additions and 390 deletions.
38 changes: 38 additions & 0 deletions .github/workflows/manual-freeze-requirements.yml
@@ -0,0 +1,38 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

name: Freeze-requirements

on:
  workflow_dispatch:

jobs:
  freeze-requirements:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          ref: ${{ github.ref }}

      - uses: actions/setup-python@v5
        with:
          python-version: "3.10"

      - name: Set up Git
        run: |
          git config --global user.name "NeuralChatBot"
          git config --global user.email "grp_neural_chat_bot@intel.com"
          git remote set-url origin https://NeuralChatBot:"${{ secrets.ACTION_TOKEN }}"@github.com/opea-project/GenAIComps.git

      - name: Run script
        run: |
          bash .github/workflows/scripts/freeze_requirements.sh

      - name: Commit changes
        run: |
          git add .
          git commit -m "Freeze requirements"
          git push
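
Because the workflow is gated on `workflow_dispatch`, it never runs automatically. As a sketch of how to trigger it, assuming the GitHub CLI (`gh`) is installed and authenticated against the repository (branch name illustrative):

```bash
# Dispatch the freeze workflow against a feature branch.
gh workflow run Freeze-requirements --ref my-feature-branch

# Check on the run it started.
gh run list --workflow=Freeze-requirements --limit 1
```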
59 changes: 59 additions & 0 deletions .github/workflows/scripts/freeze_requirements.sh
@@ -0,0 +1,59 @@
#!/bin/bash

# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

function freeze() {
    local file=$1
    local folder=$(dirname "$file")
    local keep_origin_packages="true"
    echo "::group::Check $file ..."
    pip-compile \
        --no-upgrade \
        --no-annotate \
        --no-header \
        --output-file "$folder/freeze.txt" \
        "$file"
    echo "::endgroup::"

    if [[ -e "$folder/freeze.txt" ]]; then
        if [[ "$keep_origin_packages" == "true" ]]; then
            # Strip comments and blank lines from both files.
            sed -i '/^\s*#/d; s/#.*//; /^\s*$/d' "$file"
            sed -i '/^\s*#/d; s/#.*//; /^\s*$/d' "$folder/freeze.txt"

            # Normalize package names (lowercase, '-'/'_' unified) and find those
            # present in both the original file and the compiled freeze.txt.
            packages1=$(cut -d'=' -f1 "$file" | tr '[:upper:]' '[:lower:]' | sed 's/[-_]/-/g')
            packages2=$(cut -d'=' -f1 "$folder/freeze.txt" | tr '[:upper:]' '[:lower:]' | sed 's/[-_]/-/g')
            common_packages=$(comm -12 <(echo "$packages2" | sort) <(echo "$packages1" | sort))

            # Rebuild the requirements file with pinned versions, keeping only
            # packages that were listed in the original file.
            rm "$file"
            while IFS= read -r line; do
                package=$(echo "$line" | cut -d'=' -f1)
                package_transformed=$(echo "$package" | tr '[:upper:]' '[:lower:]' | sed 's/[_-]/-/g')
                pattern=$(echo "$package_transformed" | sed 's/\[/\\\[/g; s/\]/\\\]/g')
                if echo "$common_packages" | grep -q "^$pattern$"; then
                    echo "$line" >>"$file"
                fi
            done <"$folder/freeze.txt"
            rm "$folder/freeze.txt"
        else
            mv "$folder/freeze.txt" "$file"
        fi
    fi
}

function check_branch_name() {
    if [[ "$GITHUB_REF_NAME" == "main" ]]; then
        echo "$GITHUB_REF_NAME is protected branch"
        exit 0
    else
        echo "branch name is $GITHUB_REF_NAME"
    fi
}

function main() {
    check_branch_name
    echo "::group::pip install pip-tools" && pip install pip-tools --upgrade && echo "::endgroup::"
    export -f freeze
    find . -name "requirements.txt" | xargs -n 1 -I {} bash -c 'freeze "$@"' _ {}
}

main
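
Net effect of `freeze()`: each `requirements.txt` is rewritten in place so that only the packages originally listed remain, but with the versions pip-compile resolved. A minimal local dry run, assuming pip-tools is available and a non-main branch is checked out:

```bash
pip install --upgrade pip-tools
bash .github/workflows/scripts/freeze_requirements.sh
git diff -- '*requirements.txt'  # inspect the pins before committing
```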
1 change: 1 addition & 0 deletions .gitignore
@@ -1 +1,2 @@
__pycache__
*.egg-info/
122 changes: 104 additions & 18 deletions comps/asr/whisper/whisper_model.py
@@ -16,7 +16,7 @@
class WhisperModel:
    """Convert audio to text."""

    def __init__(self, model_name_or_path="openai/whisper-small", language="english", device="cpu"):
    def __init__(self, model_name_or_path="openai/whisper-small", language="english", device="cpu", hpu_max_len=8192):
        if device == "hpu":
            # Explicitly link HPU with Torch
            from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi
@@ -31,12 +31,11 @@ def __init__(self, model_name_or_path="openai/whisper-small", language="english"
        self.model.eval()

        self.language = language
        self.hpu_max_len = hpu_max_len

        if device == "hpu":
            # do hpu graph warmup with a long enough input audio
            # whisper has a receptive field of 30 seconds
            # here we select a relatively long audio (~15 sec) to quickly warmup
            self._warmup_whisper_hpu_graph("https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/labixiaoxin.wav")
            self._warmup_whisper_hpu_graph("https://github.com/Spycsh/assets/raw/main/ljspeech_60s_audio.wav")
            self._warmup_whisper_hpu_graph("https://github.com/Spycsh/assets/raw/main/ljspeech_30s_audio.wav")

    def _audiosegment_to_librosawav(self, audiosegment):
        # https://github.com/jiaaro/pydub/blob/master/API.markdown#audiosegmentget_array_of_samples
@@ -59,11 +58,54 @@ def _warmup_whisper_hpu_graph(self, url):
        print("[ASR] warmup...")
        waveform = AudioSegment.from_file("warmup.wav").set_frame_rate(16000)
        waveform = self._audiosegment_to_librosawav(waveform)
        # pylint: disable=E1101
        inputs = self.processor.feature_extractor(
            waveform, return_tensors="pt", sampling_rate=16_000
        ).input_features.to(self.device)
        _ = self.model.generate(inputs, language="chinese")

        try:
            processed_inputs = self.processor(
                waveform,
                return_tensors="pt",
                truncation=False,
                padding="longest",
                return_attention_mask=True,
                sampling_rate=16000,
            )
        except RuntimeError as e:
            if "Padding size should be less than" in str(e):
                # short-form
                processed_inputs = self.processor(
                    waveform,
                    return_tensors="pt",
                    sampling_rate=16000,
                )
            else:
                raise e

        if processed_inputs.input_features.shape[-1] < 3000:
            # short-form
            processed_inputs = self.processor(
                waveform,
                return_tensors="pt",
                sampling_rate=16000,
            )
        else:
            processed_inputs["input_features"] = torch.nn.functional.pad(
                processed_inputs.input_features,
                (0, self.hpu_max_len - processed_inputs.input_features.size(-1)),
                value=-1.5,
            )
            processed_inputs["attention_mask"] = torch.nn.functional.pad(
                processed_inputs.attention_mask,
                (0, self.hpu_max_len + 1 - processed_inputs.attention_mask.size(-1)),
                value=0,
            )

        _ = self.model.generate(
            **(
                processed_inputs.to(
                    self.device,
                )
            ),
            language=self.language,
        )

    def audio2text(self, audio_path):
        """Convert audio to text.
@@ -80,11 +122,52 @@ def audio2text(self, audio_path):
        audio_dataset = Dataset.from_dict({"audio": [audio_path]}).cast_column("audio", Audio(sampling_rate=16000))
        waveform = audio_dataset[0]["audio"]["array"]

        # pylint: disable=E1101
        inputs = self.processor.feature_extractor(
            waveform, return_tensors="pt", sampling_rate=16_000
        ).input_features.to(self.device)
        predicted_ids = self.model.generate(inputs, language=self.language)
        try:
            processed_inputs = self.processor(
                waveform,
                return_tensors="pt",
                truncation=False,
                padding="longest",
                return_attention_mask=True,
                sampling_rate=16000,
            )
        except RuntimeError as e:
            if "Padding size should be less than" in str(e):
                # short-form
                processed_inputs = self.processor(
                    waveform,
                    return_tensors="pt",
                    sampling_rate=16000,
                )
            else:
                raise e
        if processed_inputs.input_features.shape[-1] < 3000:
            # short-form
            processed_inputs = self.processor(
                waveform,
                return_tensors="pt",
                sampling_rate=16000,
            )
        elif self.device == "hpu":
            processed_inputs["input_features"] = torch.nn.functional.pad(
                processed_inputs.input_features,
                (0, self.hpu_max_len - processed_inputs.input_features.size(-1)),
                value=-1.5,
            )
            processed_inputs["attention_mask"] = torch.nn.functional.pad(
                processed_inputs.attention_mask,
                (0, self.hpu_max_len + 1 - processed_inputs.attention_mask.size(-1)),
                value=0,
            )

        predicted_ids = self.model.generate(
            **(
                processed_inputs.to(
                    self.device,
                )
            ),
            language=self.language,
        )
        # pylint: disable=E1101
        result = self.processor.tokenizer.batch_decode(predicted_ids, skip_special_tokens=True, normalize=True)[0]
        if self.language in ["chinese", "mandarin"]:
@@ -96,20 +179,23 @@ def audio2text(self, audio_path):


if __name__ == "__main__":
    asr = WhisperModel(language="english")
    asr = WhisperModel(model_name_or_path="openai/whisper-small", language="english", device="cpu")

    # Test multilanguage asr
    asr.language = "chinese"
    urllib.request.urlretrieve(
        "https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/labixiaoxin.wav",
        "sample.wav",
    )
    asr.language = "chinese"
    text = asr.audio2text("sample.wav")

    asr.language = "english"
    urllib.request.urlretrieve(
        "https://github.com/intel/intel-extension-for-transformers/raw/main/intel_extension_for_transformers/neural_chat/assets/audio/sample.wav",
        "sample.wav",
    )
    text = asr.audio2text("sample.wav")

    os.remove("sample.wav")
    for i in [5, 10, 30, 60]:
        urllib.request.urlretrieve(f"https://github.com/Spycsh/assets/raw/main/ljspeech_{i}s_audio.wav", "sample.wav")
        text = asr.audio2text("sample.wav")
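
Both `_warmup_whisper_hpu_graph` and `audio2text` now share one shape policy: inputs under 3000 feature frames (Whisper's 30-second window) take the short-form path, while longer inputs are padded to a fixed `hpu_max_len` so HPU graph compilation sees static shapes. A condensed sketch of that policy (assumes `torch`; `feats`/`mask` stand in for the processor outputs above):

```python
import torch

def pad_for_static_shapes(feats: torch.Tensor, mask: torch.Tensor, hpu_max_len: int = 8192):
    """Pad long-form Whisper features/mask to a fixed width so one HPU graph is reused."""
    if feats.shape[-1] < 3000:
        # Short-form: fits in Whisper's 30 s receptive field, no padding needed.
        return feats, mask
    # value=-1.5 matches the padding value used in the diff above.
    feats = torch.nn.functional.pad(feats, (0, hpu_max_len - feats.size(-1)), value=-1.5)
    mask = torch.nn.functional.pad(mask, (0, hpu_max_len + 1 - mask.size(-1)), value=0)
    return feats, mask
```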
2 changes: 1 addition & 1 deletion comps/cores/proto/api_protocol.py
@@ -160,7 +160,7 @@ class ChatCompletionRequest(BaseModel):
    logit_bias: Optional[Dict[str, float]] = None
    logprobs: Optional[bool] = False
    top_logprobs: Optional[int] = 0
    max_tokens: Optional[int] = 16  # use https://platform.openai.com/docs/api-reference/completions/create
    max_tokens: Optional[int] = 1024  # use https://platform.openai.com/docs/api-reference/completions/create
    n: Optional[int] = 1
    presence_penalty: Optional[float] = 0.0
    response_format: Optional[ResponseFormat] = None
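The higher `max_tokens` default (1024 instead of 16) only changes requests that omit the field; explicit values still win. A hypothetical request against an OpenAI-compatible endpoint (host and port are illustrative):

```bash
curl http://localhost:8888/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{"messages": [{"role": "user", "content": "Hello"}], "max_tokens": 256}'
```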
2 changes: 1 addition & 1 deletion comps/dataprep/milvus/config.py
@@ -12,7 +12,7 @@
MILVUS_PORT = int(os.getenv("MILVUS_PORT", 19530))
COLLECTION_NAME = os.getenv("COLLECTION_NAME", "rag_milvus")

MOSEC_EMBEDDING_MODEL = os.environ.get("MOSEC_EMBEDDING_MODEL", "/root/bce-embedding-base_v1")
MOSEC_EMBEDDING_MODEL = os.environ.get("MOSEC_EMBEDDING_MODEL", "/home/user/bce-embedding-base_v1")
MOSEC_EMBEDDING_ENDPOINT = os.environ.get("MOSEC_EMBEDDING_ENDPOINT", "")
os.environ["OPENAI_API_BASE"] = MOSEC_EMBEDDING_ENDPOINT
os.environ["OPENAI_API_KEY"] = "Dummy key"
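
The default model path moves from `/root` to `/home/user`, matching images that run as the non-root `user` account. Deployments that keep the model elsewhere can still override it, e.g.:

```bash
# Path is illustrative; point this wherever the model is actually mounted.
export MOSEC_EMBEDDING_MODEL=/home/user/bce-embedding-base_v1
```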
1 change: 1 addition & 0 deletions comps/dataprep/pgvector/langchain/requirements.txt
@@ -22,6 +22,7 @@ psycopg2-binary
pymupdf
pyspark
python-docx
python-multipart
python-pptx
sentence_transformers
shortuuid
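`python-multipart` is the package FastAPI requires at runtime to parse `multipart/form-data` bodies, so upload endpoints fail without it. A minimal sketch of the kind of handler that depends on it (route and names are illustrative, not the dataprep service's actual handler):

```python
from fastapi import FastAPI, File, UploadFile

app = FastAPI()

@app.post("/v1/dataprep")
async def ingest(files: UploadFile = File(...)):
    # FastAPI errors out at request time if python-multipart is not installed.
    return {"filename": files.filename}
```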
23 changes: 18 additions & 5 deletions comps/dataprep/qdrant/README.md
@@ -47,15 +47,15 @@ docker build -t opea/dataprep-qdrant:latest --build-arg https_proxy=$https_proxy
## Run Docker with CLI

```bash
docker run -d --name="dataprep-qdrant-server" -p 6000:6000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/dataprep-qdrant:latest
docker run -d --name="dataprep-qdrant-server" -p 6007:6007 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/dataprep-qdrant:latest
```

## Setup Environment Variables

```bash
export http_proxy=${your_http_proxy}
export https_proxy=${your_http_proxy}
export QDRANT=${host_ip}
export QDRANT_HOST=${host_ip}
export QDRANT_PORT=6333
export COLLECTION_NAME=${your_collection_name}
```
@@ -72,19 +72,32 @@ docker compose -f docker-compose-dataprep-qdrant.yaml up -d
Once the document preparation microservice for Qdrant is started, you can use the command below to invoke it; the microservice converts a document to embeddings and saves them to the database.

```bash
curl -X POST -H "Content-Type: application/json" -d '{"path":"/path/to/document"}' http://localhost:6000/v1/dataprep
curl -X POST \
    -H "Content-Type: multipart/form-data" \
    -F "files=@./file1.txt" \
    http://localhost:6007/v1/dataprep
```

You can specify chunk_size and chunk_overlap with the following command.

```bash
curl -X POST -H "Content-Type: application/json" -d '{"path":"/path/to/document","chunk_size":1500,"chunk_overlap":100}' http://localhost:6000/v1/dataprep
curl -X POST \
    -H "Content-Type: multipart/form-data" \
    -F "files=@./file1.txt" \
    -F "chunk_size=1500" \
    -F "chunk_overlap=100" \
    http://localhost:6007/v1/dataprep
```

We support table extraction from PDF documents. You can specify process_table and table_strategy with the following command. "table_strategy" selects the strategy used to understand tables for table retrieval: as the setting progresses from "fast" to "hq" to "llm", the focus shifts towards deeper table understanding at the expense of processing speed. The default strategy is "fast".

Note: If you specify "table_strategy=llm", you should first start a TGI service (see sections 1.2.1 and 1.3.1 in https://github.com/opea-project/GenAIComps/tree/main/comps/llms/README.md) and then `export TGI_LLM_ENDPOINT="http://${your_ip}:8008"`.

```bash
curl -X POST -H "Content-Type: application/json" -d '{"path":"/path/to/document","process_table":true,"table_strategy":"hq"}' http://localhost:6000/v1/dataprep
curl -X POST \
    -H "Content-Type: multipart/form-data" \
    -F "files=@./your_file.pdf" \
    -F "process_table=true" \
    -F "table_strategy=hq" \
    http://localhost:6007/v1/dataprep
```
2 changes: 1 addition & 1 deletion comps/dataprep/qdrant/config.py
@@ -7,7 +7,7 @@
EMBED_MODEL = os.getenv("EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")

# Qdrant configuration
QDRANT_HOST = os.getenv("QDRANT", "localhost")
QDRANT_HOST = os.getenv("QDRANT_HOST", "localhost")
QDRANT_PORT = int(os.getenv("QDRANT_PORT", 6333))
COLLECTION_NAME = os.getenv("COLLECTION_NAME", "rag-qdrant")

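With the rename from `QDRANT` to `QDRANT_HOST`, the config key now matches the README's environment setup above; anyone exporting the old name needs to switch:

```bash
# Old name, no longer read:
#   export QDRANT=${host_ip}
export QDRANT_HOST=${host_ip}
export QDRANT_PORT=6333
```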
12 changes: 9 additions & 3 deletions comps/dataprep/qdrant/docker/Dockerfile
@@ -12,6 +12,7 @@ RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missin
    build-essential \
    libgl1-mesa-glx \
    libjemalloc-dev \
    default-jre \
    vim

RUN useradd -m -s /bin/bash user && \
@@ -22,13 +23,18 @@ USER user

COPY comps /home/user/comps

RUN pip install --no-cache-dir --upgrade pip && \
    if [ ${ARCH} = "cpu" ]; then pip install torch --index-url https://download.pytorch.org/whl/cpu; fi && \
RUN pip install --no-cache-dir --upgrade pip setuptools && \
    if [ ${ARCH} = "cpu" ]; then pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu; fi && \
    pip install --no-cache-dir -r /home/user/comps/dataprep/qdrant/requirements.txt

ENV PYTHONPATH=$PYTHONPATH:/home/user

USER root

RUN mkdir -p /home/user/comps/dataprep/qdrant/uploaded_files && chown -R user /home/user/comps/dataprep/qdrant/uploaded_files

USER user

WORKDIR /home/user/comps/dataprep/qdrant

ENTRYPOINT ["python", "prepare_doc_qdrant.py"]
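
A hypothetical build-and-run for this image, following the README conventions earlier in the diff (proxy build-args omitted; build context assumed to be the repository root):

```bash
docker build -t opea/dataprep-qdrant:latest \
    -f comps/dataprep/qdrant/docker/Dockerfile .
docker run -d --name dataprep-qdrant-server -p 6007:6007 opea/dataprep-qdrant:latest
```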

(Diff truncated: the remaining changed files are not shown.)