oscillate vits duration

c7362aa 9 months ago

20.8 kB


	# -- coding: utf-8 --
	import numpy as np
	import soundfile
	from Utils.text_utils import split_into_sentences
	import msinference
	import re
	import srt
	import time
	import subprocess
	import cv2
	from pathlib import Path
	from types import SimpleNamespace
	from flask import Flask, request, send_from_directory
	from moviepy.video.io.VideoFileClip import VideoFileClip
	from moviepy.video.VideoClip import ImageClip
	from audiocraft.builders import AudioGen

	# Directory (relative to the working directory) where serve_wav() stores
	# uploaded client files and the generated output it sends back.
	CACHE_DIR = 'flask_cache/'
	# AudioGen instance created once at import time, moved to 'cuda:0' and
	# switched to eval mode; overlay() calls its generate() for soundscapes.
	sound_generator = AudioGen().to('cuda:0').eval() # duration chosen in generate()

	# Make sure the cache directory exists before any request is handled.
	Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)


	def resize_with_white_padding(image, target_w=1920, target_h=1080, pad_value=255):
	    """
	    Fit an image into a fixed-size canvas while preserving aspect ratio.

	    The image is scaled (Lanczos) so that it touches the canvas on its
	    limiting dimension and is then centred, with the remaining area filled
	    with `pad_value` (white by default).

	    Args:
	        image (np.ndarray): Input image, 2-D (grayscale) or 3-D (with channels).
	        target_w (int): Canvas width in pixels. Defaults to 1920.
	        target_h (int): Canvas height in pixels. Defaults to 1080.
	        pad_value (int): Constant used for the padding. Defaults to 255.

	    Returns:
	        np.ndarray: Array whose first two dimensions are (target_h, target_w).

	    Raises:
	        ValueError: If `image` is None (e.g. cv2.imread() could not decode
	            the file) or has no pixels.
	    """
	    if image is None or image.size == 0:
	        raise ValueError('resize_with_white_padding() needs a non-empty image; '
	                         'got None or an empty array (unreadable file?)')

	    h, w = image.shape[:2]
	    aspect_ratio = w / h
	    target_aspect_ratio = target_w / target_h

	    if aspect_ratio > target_aspect_ratio:
	        # Image is wider than the target: fill the width, pad top and bottom.
	        new_w = target_w
	        new_h = max(1, int(new_w / aspect_ratio))  # never ask cv2 for 0 rows
	    elif aspect_ratio < target_aspect_ratio:
	        # Image is taller than the target: fill the height, pad left and right.
	        new_h = target_h
	        new_w = max(1, int(new_h * aspect_ratio))  # never ask cv2 for 0 columns
	    else:
	        # Aspect ratio matches the target, just resize.
	        new_w, new_h = target_w, target_h

	    resized_image = cv2.resize(
	        image, (new_w, new_h), interpolation=cv2.INTER_LANCZOS4)

	    padding_h = target_h - new_h
	    padding_w = target_w - new_w
	    if padding_h == 0 and padding_w == 0:
	        return resized_image

	    top_padding = padding_h // 2
	    left_padding = padding_w // 2
	    padding = [(top_padding, padding_h - top_padding),
	               (left_padding, padding_w - left_padding)]
	    # Build the pad spec from the *resized* array: cv2.resize may return a
	    # 2-D array for a single-channel 3-D input, and np.pad requires exactly
	    # one (before, after) pair per dimension.
	    padding.extend([(0, 0)] * (resized_image.ndim - 2))
	    return np.pad(resized_image, padding,
	                  mode='constant', constant_values=pad_value)


	def _shorten(filename):
	return filename.replace("/", "")[-6:]


	def _resize(image, width=None, height=None, inter=cv2.INTER_AREA):
	    '''Aspect-preserving resize to a requested width or height.

	    Adapted from
	    https://github.com/PyImageSearch/imutils/blob/master/imutils/convenience.py

	    image  : array whose first two dimensions are (height, width)
	    width  : target width in pixels; when given, `height` is ignored
	    height : target height in pixels; used only when `width` is None
	    inter  : cv2 interpolation flag

	    Returns `image` itself when neither `width` nor `height` is given,
	    otherwise the result of cv2.resize().
	    '''
	    src_h, src_w = image.shape[:2]

	    # nothing requested -> hand back the input untouched
	    if width is None and height is None:
	        return image

	    if width is not None:
	        # scale factor is driven by the requested width
	        scale = width / float(src_w)
	        target_size = (width, int(src_h * scale))
	    else:
	        # scale factor is driven by the requested height
	        scale = height / float(src_h)
	        target_size = (int(src_w * scale), height)

	    return cv2.resize(image, target_size, interpolation=inter)


	def overlay(x, soundscape=None):
	    '''Mix a generated background soundscape under the signal `x`.

	    x          : audio samples; its length is divided by 16000 to get the
	                 duration in seconds, i.e. it is treated as 16 kHz audio
	    soundscape : text prompt passed to sound_generator.generate(), or None

	    Returns `x` unchanged when no soundscape is requested, otherwise
	    0.6 * x + 0.4 * background (background cut to len(x)).
	    '''
	    if soundscape is None:
	        return x

	    # ask for slightly more audio than needed (+0.74 s), then cut to len(x)
	    seconds = len(x)/16000 + .74
	    generated = sound_generator.generate(soundscape,
	                                         duration=seconds)
	    background = generated.detach().cpu().numpy()
	    return .6 * x + .4 * background[:len(x)]


	def tts_multi_sentence(precomputed_style_vector=None,
	                       text=None,
	                       voice=None,
	                       soundscape=None,
	                       speed=None):
	    '''Synthesise speech for `text`, normalise it and optionally add a soundscape.

	    precomputed_style_vector : style vector for StyleTTS2 (English / affective
	                               TTS). If None, msinference.foreign() (MMS TTS)
	                               is used instead.
	    text : string, or a list of sentences. A string is split into sentences
	           only on the StyleTTS2 path.
	    voice : string or None. On the StyleTTS2 path it only selects a
	            pronunciation fix; on the MMS path it is passed as `lang`
	            (e.g. 'romanian', 'serbian', 'hungarian').
	    soundscape : e.g. 'A castle in far away lands' -> if passed, overlay()
	                 mixes a generated background sound into the speech.
	    speed : forwarded to msinference.foreign() only.

	    Returns the value of overlay(): the normalised speech array, mixed with
	    the soundscape when one was requested.
	    NOTE(review): the previous docstring said "24kHZ", but every caller in
	    this file writes the result at 16000 Hz and overlay() assumes 16000 Hz -
	    confirm the actual rate against msinference.
	    '''

	    if precomputed_style_vector is not None:

	        # StyleTTS2 - English

	        if not isinstance(text, list):
	            text = split_into_sentences(text)  # Avoid OOM in StyleTTS2

	        # `voice` may be None (see docstring): guard the substring test so it
	        # cannot raise TypeError, and evaluate it once instead of per sentence.
	        fix_abstract = voice is not None and 'vctk_low#p326' in voice

	        x = []
	        for _sentence in text:

	            # StyleTTS2 - pronunciation fixes

	            # .replace("ţ", "ț").replace('ț','ts').replace('î', 'u')
	            _sentence = _sentence.lower()
	            if fix_abstract:
	                # fix sounding of sleepy AAABS TRAACT
	                _sentence = _sentence.replace(
	                    'abstract', 'ahbstract')  # 'ahstract'
	            x.append(msinference.inference(_sentence,
	                                           precomputed_style_vector))
	        x = np.concatenate(x)

	    else:

	        # Fallback - MMS TTS - Non-English
	        # dont split foreign sentences: Avoids speaker change issue
	        x = msinference.foreign(text=text,
	                                lang=voice,  # voice = 'romanian', 'serbian' 'hungarian'
	                                speed=speed)  # normalisation externally

	    # volume: scale in place so the peak lands at roughly 1 / 1.12 of full
	    # scale (1e-7 avoids division by zero on silence). The soundscape added
	    # by overlay() is not normalised.
	    x /= 1.12 * np.abs(x).max() + 1e-7

	    return overlay(x, soundscape=soundscape)


	# voices = {}
	# import phonemizer
	# global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True)
	# Flask application object; serve_wav() is registered on it as the "/" route.
	app = Flask(__name__)


	def _form_value(form, key):
	    """Return the first value submitted for `key`, or None if the field is absent."""
	    values = form.get(key)
	    return values[0] if values else None


	def _cached_path(form, key):
	    """Map the client-side file name in field `key` to its path inside CACHE_DIR."""
	    name = _form_value(form, key)
	    return None if name is None else CACHE_DIR + _shorten(name)


	def _make_banner(label, origin, line_type):
	    """Draw white `label` onto a black 104x1920 BGR strip and return it."""
	    banner = np.zeros((104, 1920, 3), dtype=np.uint8)
	    cv2.putText(banner, label,
	                origin,  # bottom-left corner of the text (w, h)
	                cv2.FONT_HERSHEY_SIMPLEX,
	                2,  # fontScale
	                (255, 255, 255),  # fontColor
	                4,  # thickness
	                line_type)
	    return banner


	def _blend_banner(im, frame, offset_h=24):
	    """Blend banner `frame` (60%) over `im` (40%), `offset_h` rows from the top."""
	    h, w = frame.shape[:2]
	    region = im[offset_h:h + offset_h, :w, :]
	    # Crop the banner to the region actually available so a very small
	    # video frame cannot trigger a broadcasting error.
	    frame = frame[:region.shape[0], :region.shape[1]]
	    im[offset_h:h + offset_h, :w, :] = (.4 * region + .6 * frame).astype(np.uint8)
	    return im


	@app.route("/", methods=['GET', 'POST', 'PUT'])
	def serve_wav():
	    """Run TTS for an uploaded text / subtitle file and return the result.

	    Form fields (each value is read as the first item of the submitted list):
	        text       : client-side name of the uploaded .txt or .srt file (required)
	        voice      : voice / language identifier (required)
	        video      : client-side name of an uploaded video (required for .srt)
	        image      : client-side name of an uploaded image
	        native     : client-side name of an audio file used for voice cloning
	        affective  : selects the style-vector directory for en_US / en_UK voices
	        soundscape : text prompt for a generated background sound

	    Uploaded files are saved into CACHE_DIR under _shorten(<field name>).

	    Returns:
	        The generated file from CACHE_DIR ('tmp.mp4' when a video or image
	        was given, otherwise 'tmp.wav'), with its name repeated in the
	        'suffix-file-type' response header; or a (message, 400) tuple when
	        the request is incomplete or an input cannot be used.

	    Raises:
	        subprocess.CalledProcessError: if an ffmpeg invocation fails.
	    """
	    # https://stackoverflow.com/questions/13522137/in-flask-convert-form-post-
	    # object-into-a-representation-suitable-for-mongodb
	    r = request.form.to_dict(flat=False)

	    # Physically Save Client Files
	    for filename, obj in request.files.items():
	        obj.save(f'{CACHE_DIR}{_shorten(filename)}')

	    print('Saved all files on Server Side\n\n')

	    args = SimpleNamespace(
	        # crop last letters from original filename & use as tmp
	        text=_cached_path(r, 'text'),
	        video=_cached_path(r, 'video'),
	        image=_cached_path(r, 'image'),
	        native=_cached_path(r, 'native'),
	        # NOTE(review): form values are strings, so any non-empty value
	        # (including 'False') is truthy where this is tested below - confirm
	        # what the client sends.
	        affective=_form_value(r, 'affective'),
	        voice=_form_value(r, 'voice'),
	        speed=None,  # obsolete due to oscillating MMS TTS VITS duration per language
	        soundscape=_form_value(r, 'soundscape'),
	    )

	    print(args, 'ENTER Script')

	    # Reject incomplete requests explicitly instead of failing later with a
	    # TypeError / AttributeError on None.
	    if args.text is None or args.voice is None:
	        return "Form fields 'text' and 'voice' are required.\n", 400

	    do_video_dub = args.text.endswith('.srt')

	    SILENT_VIDEO = '_silent_video.mp4'
	    AUDIO_TRACK = '_audio_track.wav'

	    x_native = None  # 16 kHz mono audio of the input video (video dubbing only)

	    if do_video_dub:
	        print(
	            f'==\nFound .srt : {args.text}, thus Video should be given as well\n\n')
	        if args.video is None:
	            return "A 'video' field is required when 'text' is an .srt file.\n", 400
	        with open(args.text, "r") as f:
	            s = f.read()
	        text = [[j.content, j.start.total_seconds(), j.end.total_seconds()]
	                for j in srt.parse(s)]
	        if not text:
	            return "The .srt file contains no subtitles.\n", 400
	        native_audio_file = '_tmp.wav'
	        subprocess.run(
	            ["ffmpeg",
	             "-y",  # https://stackoverflow.com/questions/39788972/ffmpeg-overwrite-output-file-if-exists
	             "-i",
	             args.video,
	             "-f",
	             "mp3",
	             "-ar",
	             "16000",  # "22050 for mimic3",
	             "-vn",
	             native_audio_file],
	            check=True)  # do not go on with a stale / missing audio file
	        x_native, _ = soundfile.read(native_audio_file)  # reads mp3

	        # stereo in video -> keep first channel
	        if x_native.ndim > 1:
	            x_native = x_native[:, 0]
	    else:
	        with open(args.text, 'r') as f:
	            text = f.read()
	        # collapse runs of spaces / split in list happens in tts_multi_sentence()
	        text = re.sub(' +', ' ', text)

	    # == STYLE VECTOR ==

	    precomputed_style_vector = None

	    if args.native:  # Voice Cloning
	        try:
	            precomputed_style_vector = msinference.compute_style(args.native)
	        except soundfile.LibsndfileError:  # Fallback - internal voice
	            print('\n Could not voice clone audio:', args.native,
	                  'fallback to video or Internal TTS voice.\n')
	        # NOTE(review): kept as in the original control flow - when dubbing a
	        # video, the first 4 s of its audio replace the style computed from
	        # `native` even if that succeeded; confirm this is intended.
	        if do_video_dub:  # Clone voice via Video
	            soundfile.write('tgt_spk.wav',
	                            x_native[:4 * 16000].astype(np.float32),
	                            16000)  # 27400?
	            precomputed_style_vector = msinference.compute_style('tgt_spk.wav')

	    # NOTE: style vector is normally None here - except if --native arg was passed

	    if precomputed_style_vector is None:
	        voice_stem = args.voice.replace(
	            '/', '_').replace(
	            '#', '_').replace(
	            'cmu-arctic', 'cmu_arctic').replace(
	            '_low', '')
	        # Native English Accent TTS
	        if 'en_US' in args.voice or 'en_UK' in args.voice:
	            _dir = '/' if args.affective else '_v2/'
	            precomputed_style_vector = msinference.compute_style(
	                'assets/wavs/style_vector' + _dir + voice_stem + '.wav')
	        # Non-Native English Accent TTS
	        elif '_' in args.voice:
	            precomputed_style_vector = msinference.compute_style(
	                'assets/wavs/mimic3_foreign_4x/' + voice_stem + '.wav')
	        # Foreign Lang
	        else:
	            print(f'\n\n\n\n\n FallBack to MMS TTS due to: {args.voice=}')

	    # NOTE : precomputed_style_vector is still None if MMS TTS

	    def synthesize(passage):
	        '''TTS with the voice / style / soundscape chosen for this request.'''
	        return tts_multi_sentence(text=passage,
	                                  precomputed_style_vector=precomputed_style_vector,
	                                  voice=args.voice,
	                                  soundscape=args.soundscape,
	                                  speed=args.speed)

	    # == SILENT VIDEO ==

	    if args.video is not None:
	        # banners - precomputed @ 1920 pixels, resized to the video width below
	        frame_tts = _make_banner('TTS', (240, 74), 2)
	        frame_orig = _make_banner('ORIGINAL VOICE', (101, 74), 1000)

	        print(f'\n______________________________\n'
	              f'Gen Banners for TTS/Native Title {frame_tts.shape=} {frame_orig.shape=}'
	              f'\n______________________________\n')
	        # ====SILENT VIDEO EXTRACT====
	        # DOWNLOAD SRT from youtube
	        #
	        # yt-dlp --write-sub --sub-lang en --convert-subs "srt" https://www.youtube.com/watch?v=F1Ib7TAu7eg&list=PL4x2B6LSwFewdDvRnUTpBM7jkmpwouhPv&index=2
	        #
	        # .mkv ->.mp4 moviepy loads only .mp4
	        #
	        # ffmpeg -y -i Distaff\ \[qVonBgRXcWU\].mkv -c copy -c:a aac Distaff_qVonBgRXcWU.mp4
	        vf = VideoFileClip(args.video)

	        # GET 1st FRAME to OBTAIN frame RESOLUTION
	        video_w = vf.get_frame(0).shape[1]
	        frame_tts = _resize(frame_tts, width=video_w)
	        frame_orig = _resize(frame_orig, width=video_w)

	        if x_native is not None:

	            # banner says whether native voice or tts is playing
	            num = x_native.shape[0]
	            # fade heaviside
	            is_tts = .5 + .5 * np.tanh(4*(np.linspace(-10, 10, num) + 9.4))

	            def inpaint_banner(get_frame, t):
	                '''blend banner - (now plays) tts or native voice'''
	                im = np.copy(get_frame(t))  # pic

	                ix = int(t * 16000)  # ix may exceed len(is_tts)
	                # mask > .5 => tts / otherwise native; an ix beyond the
	                # native audio is assumed to be tts
	                if ix < num and not is_tts[ix] > .5:
	                    frame = frame_orig
	                else:
	                    frame = frame_tts

	                print(
	                    f' > inpaint_banner() HAS NATIVE: {frame.shape=} {im.shape=}\n\n\n\n')

	                return _blend_banner(im, frame)

	        else:  # no native audio track: always show the TTS banner

	            def inpaint_banner(get_frame, t):
	                '''blend the TTS banner into every frame'''
	                im = np.copy(get_frame(t))

	                # Start from the pre-sized banner so it is always defined;
	                # resize only if this frame has a different width.
	                local_frame = frame_tts
	                if local_frame.shape[1] != im.shape[1]:  # rsz banner to fit video w
	                    local_frame = _resize(frame_tts, width=im.shape[1])
	                return _blend_banner(im, local_frame)

	        vf = vf.fl(inpaint_banner)
	        vf.write_videofile(SILENT_VIDEO)

	        # ==== TTS .srt ====

	        if do_video_dub:
	            OUT_FILE = 'tmp.mp4'  # args.out_file + '_video_dub.mp4'
	            subtitles = text
	            MAX_LEN = int(subtitles[-1][2] + 17) * 16000
	            # 17 extra seconds fail-safe for long-last-segment
	            print("TOTAL LEN SAMPLES ", MAX_LEN, '\n====================')
	            pieces = [synthesize(_text_) for _text_, _start, _end in subtitles]
	            total = np.concatenate(pieces, 0)
	            # PAD SHORTEST of TTS / NATIVE so both tracks have equal length
	            gap = len(x_native) - len(total)
	            if gap > 0:
	                total = np.pad(total, (0, gap))
	            else:
	                x_native = np.pad(x_native, (0, -gap))
	            soundfile.write(AUDIO_TRACK,
	                            (.64 * total + .27 * x_native)[:, None],
	                            16000)
	        else:  # Video from plain (.txt)
	            OUT_FILE = 'tmp.mp4'
	            x = synthesize(text)
	            soundfile.write(AUDIO_TRACK, x, 16000)

	    # IMAGE 2 SPEECH

	    if args.image is not None:

	        # Resize Input Image to 1920x1080 - Issue of .mp4 non visible for other aspect ratios
	        source_image = cv2.imread(args.image)
	        if source_image is None:  # cv2.imread() signals failure by returning None
	            return "The uploaded image could not be decoded.\n", 400

	        STATIC_FRAME = args.image + '.jpg'  # 'assets/image_from_T31.jpg'
	        cv2.imwrite(STATIC_FRAME, resize_with_white_padding(source_image))

	        OUT_FILE = 'tmp.mp4'  # args.out_file + '_image_to_speech.mp4'

	        # SILENT CLIP

	        clip_silent = ImageClip(img=STATIC_FRAME,
	                                duration=5)  # ffmpeg continues this silent video for duration of TTS
	        clip_silent.write_videofile(SILENT_VIDEO, fps=24)

	        x = synthesize(text)
	        soundfile.write(AUDIO_TRACK, x, 16000)

	    if args.video or args.image:
	        # write final output video: video stream of input 0 + audio of input 1
	        subprocess.run(
	            ["ffmpeg",
	             "-y",
	             "-i",
	             SILENT_VIDEO,
	             "-i",
	             AUDIO_TRACK,
	             "-c:v",
	             "copy",
	             "-map",
	             "0:v:0",
	             "-map",
	             "1:a:0",
	             CACHE_DIR + OUT_FILE],
	            check=True)  # never serve a stale file after a failed mux

	        print(f'\noutput video is saved as {OUT_FILE}')

	    else:

	        # Fallback: No image nor video provided - do only tts
	        x = synthesize(text)
	        OUT_FILE = 'tmp.wav'
	        soundfile.write(CACHE_DIR + OUT_FILE, x, 16000)

	    # https://stackoverflow.com/questions/67591467/
	    # flask-shows-typeerror-send-from-directory-missing-1-required-positional-argum

	    # send server's output as default file -> srv_result.xx
	    print(f'\n=SERVER saved as {OUT_FILE=}\n')
	    response = send_from_directory(CACHE_DIR, path=OUT_FILE)
	    response.headers['suffix-file-type'] = OUT_FILE
	    print('________________\n ? \n_______________')
	    return response


	# Script entry point: start Flask's built-in server, listening on all
	# network interfaces (host "0.0.0.0"); no port is passed here.
	if __name__ == "__main__":
	app.run(host="0.0.0.0")


	# Concat. .mp4

	# _list.txt
	#
	# file out/som_utasitvany_en_txt.mp4
	# file out/som_utasitvany_hu_txt.mp4
	#
	#
	# subprocess.run(
	# [
	# "ffmpeg",
	# "-f",
	# "concat",
	# '-safe',
	# '0',
	# '-i',
	# '_list.txt',
	# '-c',
	# 'copy',
	# f'fusion.mp4', # save to correct location is handled in client
	# ])
	#
	# ffmpeg -f concat -i mylist.txt -c copy output.mp4