import functools

import gradio as gr
import joblib
import librosa
import numpy as np
import soundfile as sf
import torch
from transformers import AutoFeatureExtractor, WavLMModel
| |
|
class HuggingFaceFeatureExtractor:
    """Wrap a HuggingFace audio model and its paired feature extractor.

    Calling the instance with a waveform returns the model's last hidden
    state (one embedding per time frame), computed without gradients.
    """

    def __init__(self, model_class, name):
        # Run on GPU when one is visible; inputs are moved to the same
        # device in __call__ so model and tensors always match.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.feature_extractor = AutoFeatureExtractor.from_pretrained(name)
        self.model = model_class.from_pretrained(name)
        self.model.eval()
        self.model.to(self.device)

    def __call__(self, audio, sr):
        """Return the model's last hidden state for the given waveform."""
        batch = self.feature_extractor(
            audio,
            sampling_rate=sr,
            return_tensors="pt",
            padding=True,
        )
        batch = {key: tensor.to(self.device) for key, tensor in batch.items()}
        # Inference only — no autograd bookkeeping needed.
        with torch.no_grad():
            return self.model(**batch).last_hidden_state
| |
|
@functools.lru_cache(maxsize=None)
def _load_extractor(checkpoint):
    """Build a HuggingFaceFeatureExtractor once per checkpoint and cache it.

    Without caching, every call to a FEATURE_EXTRACTORS factory reloaded a
    full WavLM checkpoint from disk/hub — process_audio invokes all four
    factories per request, so this memoization avoids reloading four models
    on every inference.
    """
    return HuggingFaceFeatureExtractor(WavLMModel, checkpoint)


# Zero-argument factories (same calling convention as before:
# FEATURE_EXTRACTORS[key]() returns a ready extractor), now backed by the
# memoized loader above so repeated calls reuse the loaded models.
FEATURE_EXTRACTORS = {
    "wavlm-base": lambda: _load_extractor("microsoft/wavlm-base"),
    "wavLM-V1": lambda: _load_extractor("DavidCombei/wavLM-base-Deepfake_V1"),
    "wavLM-V2": lambda: _load_extractor("DavidCombei/wavLM-base-Deepfake_V2"),
    "wavLM-V3": lambda: _load_extractor("DavidCombei/wavLM-base-Deepfake_V3"),
}
| |
|
# First-stage classifiers (one per feature extractor) and the fusion model
# that combines their probability outputs; all serialized with joblib.
model1, model2, model3, model4 = (
    joblib.load(f"model{idx}.joblib") for idx in range(1, 5)
)
final_model = joblib.load("final_model.joblib")
| |
|
def process_audio(file_audio):
    """Classify an uploaded audio file as real or fake.

    Args:
        file_audio: Path to an audio file (any format librosa can decode).

    Returns:
        A human-readable verdict string with a confidence percentage.
    """
    # Decode and resample to 16 kHz, the rate the WavLM extractors are fed.
    audio, sr = librosa.load(file_audio, sr=16000)

    # Keep only the first channel if the signal is multi-channel.
    if audio.ndim > 1:
        audio = audio[0]

    # One feature row per extractor: mean-pool the hidden states over the
    # time axis, then flatten to a single-sample row vector.
    extractor_keys = ("wavlm-base", "wavLM-V1", "wavLM-V2", "wavLM-V3")
    features = []
    for key in extractor_keys:
        hidden = FEATURE_EXTRACTORS[key]()(audio, sr)
        features.append(torch.mean(hidden, dim=1).cpu().numpy().reshape(1, -1))

    # First-stage classifiers: take column 1 of predict_proba (the class
    # this pipeline reports as "Real" in the verdict below).
    stage_models = (model1, model2, model3, model4)
    stage_probs = [
        clf.predict_proba(feat)[:, 1].reshape(-1, 1)
        for clf, feat in zip(stage_models, features)
    ]

    # Fuse the four probabilities with the meta-classifier. Extract a plain
    # Python float: truth-testing a 1-element NumPy array (as the original
    # `if final_prob < 0.5` did) is deprecated and fragile.
    combined = np.hstack(stage_probs)
    final_prob = float(final_model.predict_proba(combined)[0, 1])

    if final_prob < 0.5:
        return f"Fake with a confidence of: {100 - final_prob * 100:.2f}%"
    return f"Real with a confidence of: {final_prob * 100:.2f}%"
| |
|
# Gradio UI: takes an uploaded audio file path and shows the verdict string
# produced by process_audio.
interface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title="Audio Deepfake Detection",
    description=(
        "Upload an audio file to detect whether it is fake or real. "
        "The system uses an ensemble of features from wavLM base and "
        "fine-tuned versions. Submitted to ASVSpoof5."
    ),
)

# share=True additionally exposes a public tunnel URL beyond the local server.
interface.launch(share=True)
| |
|