[Python] How to implement voice activity detection and stop Twilio voice bot on customer...

Stack · Outubro 7, 2024

I am building a voice bot using Twilio and Django. I have set up an outbound call with media streaming, which is working fine. However, I want to implement voice activity detection so that if the customer interrupts while the voice bot is speaking, the bot stops speaking immediately. I am looking for guidance on how to achieve this using Twilio's Media Streams feature.

I have set up media streaming for the Twilio outbound call using the `<Stream>` verb (inside `<Connect>`) in the TwiML response. This is my view:

class StartOutboundCallingView(APIView):
    """API view that places an outbound Twilio call wired to a media stream."""

    # Twilio REST client shared by every request handled by this view.
    # NOTE(review): `sid` and `token` are not defined in this snippet;
    # presumably module-level credentials — confirm.
    client = Client(sid, token)

    def post(self, request, format=None):
        """Create an outbound call from ``from`` to ``to``.

        Args:
            request: DRF request whose body must contain ``from`` and ``to``.
            format: Unused; kept for the DRF handler signature.

        Returns:
            Response: HTTP 400 when either field is missing, otherwise
            HTTP 200 with the created call's SID under ``call_id``.
        """
        # Validate first so no request values are read on the error path.
        if "from" not in request.data or "to" not in request.data:
            return Response(
                {"Required Field": "from and to are required fields"},
                status=status.HTTP_400_BAD_REQUEST,
            )

        call = self.client.calls.create(
            twiml=generate_twiml(),
            to=request.data["to"],
            from_=request.data["from"],
        )
        return Response({"call_id": call.sid}, status=status.HTTP_200_OK)

This is how I am generating the TwiML:

def generate_twiml():
    """Return the TwiML document that connects the call to the stream at ``url``.

    ``url`` is read from the enclosing module scope. The document starts and
    ends with a newline, one element per line.
    """
    twiml_lines = (
        "",
        "<Response>",
        "<Connect>",
        f'<Stream url="{url}"/>',
        "</Connect>",
        "</Response>",
        "",
    )
    return "\n".join(twiml_lines)

This is my WebSocket consumer:

class TwilioWS(WebsocketConsumer):
    """WebSocket consumer for a Twilio Media Streams connection.

    Incoming ``media`` messages are base64-decoded and handed to the
    transcriber on a worker thread. Audio passed to
    ``handle_transcribed_audio`` is sent back to Twilio on the same stream.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.groq_ai = None
        self.call_id = None
        # Kept for backward compatibility; the stream SID actually used by
        # this class is stored in ``streamSid``.
        self.stream_id = None
        # Set by the "start" event. Initialised here so it can be read
        # safely before that event arrives.
        self.streamSid = None
        self.transcriber = None
        self.tts = None
        # A single worker means audio chunks are processed one at a time.
        self.thread_pool = concurrent.futures.ThreadPoolExecutor(max_workers=1)
        self.loop = asyncio.get_event_loop()

    def connect(self):
        """Start the transcriber and the Groq client, then accept the socket."""
        self.transcriber: DeepgramSTT = DeepgramSTT(self)
        self.transcriber.start_transcription()
        asyncio.run(self.init_groq())
        return super().connect()

    async def init_groq(self) -> None:
        """Create and initialize the Groq client bound to this consumer."""
        self.groq_ai: GroqAI = GroqAI(self)
        self.groq_ai.initialize()

    def disconnect(self, code=None) -> None:
        """Release the transcriber and the worker pool when the socket closes.

        Args:
            code: WebSocket close code supplied by Channels. Optional so that
                existing zero-argument calls keep working.
        """
        # ``connect`` may have failed before the transcriber was created.
        if self.transcriber is not None:
            self.transcriber.disconnect()
        # Do not block the consumer while queued chunks drain.
        self.thread_pool.shutdown(wait=False)

    def receive(self, text_data: str = None, bytes_data: bytes = None) -> None:
        """Handle an incoming WebSocket message.

        Args:
            text_data (str): JSON-encoded message. ``media`` messages carry a
                base64-encoded audio payload under ``media.payload``.
            bytes_data (bytes): Binary frame, if any. Not processed; accepted
                only so a binary frame does not raise ``TypeError``.

        Returns:
            None
        """
        if not text_data:
            self.transcriber.disconnect()
            return

        data: dict = json.loads(text_data)
        event: str = data.get('event')

        if event == "start":
            self.streamSid: str = data['start']['streamSid']
            self.call_id: str = data['start']['callSid']
        elif event == "media":
            self.handle_transcription(
                base64.b64decode(data["media"]["payload"]),
                self.call_id,
            )
        elif event == "stop":
            self.transcriber.disconnect()

    def handle_transcription(self, chunk: bytes, call_id: str) -> None:
        """Queue a chunk of audio for transcription on the worker thread.

        Args:
            chunk (bytes): The audio chunk to transcribe.
            call_id (str): The ID of the call.
        """
        self.thread_pool.submit(
            self.transcriber.transcribe, chunk, call_id
        )

    # Return response audio to Twilio.
    def handle_transcribed_audio(self, audio_data):
        """Send response audio back to Twilio as a ``media`` message.

        Args:
            audio_data (bytes): Raw audio to send; it is base64-encoded here.
        """
        # Without a stream SID (no "start" event yet) there is no stream to
        # address, so there is nothing to send.
        if self.streamSid is None:
            return

        encoded_audio = base64.b64encode(audio_data).decode('UTF-8')
        self.send(json.dumps({
            'streamSid': self.streamSid,
            'event': 'media',
            'media': {
                'payload': encoded_audio,
            },
        }))

Continue reading...

Logar ou Criar uma Conta

[Python] How to implement voice activity detection and stop Twilio voice bot on customer...

Stack Membro Participativo

Compartilhe esta Página

Logar ou Criar uma Conta

[Python] How to implement voice activity detection and stop Twilio voice bot on customer...

Stack Membro Participativo

Compartilhe esta Página

Pesquisas Úteis