| | import gradio as gr |
| | from sentence_transformers import SentenceTransformer, util |
| | import pandas as pd |
| | from datasets import load_dataset |
| | from annoy import AnnoyIndex |
| | import os |
| |
|
| | try: |
| | |
| | dataset = load_dataset("PhilipMay/stsb_multi_mt", name="it", split="test") |
| | df = pd.DataFrame(dataset) |
| |
|
| | |
| | sentences1 = df["sentence1"].tolist() |
| | sentences2 = df["sentence2"].tolist() |
| |
|
| | |
| | model_names = [ |
| | "nickprock/multi-sentence-BERTino", |
| | "nickprock/sentence-bert-base-italian-uncased", |
| | "nickprock/static-similarity-mmarco3m-mrl-BERTino-v1.5", |
| | "nickprock/Italian-ModernBERT-base-embed-mmarco-mnrl", |
| | ] |
| |
|
| | models = {name: SentenceTransformer(name) for name in model_names} |
| | annoy_indexes1 = {} |
| | annoy_indexes2 = {} |
| |
|
def build_annoy_index(model_name, sentences, n_trees=10, metric="angular"):
    """Build an Annoy index over the embeddings of ``sentences``.

    Args:
        model_name: Key into the module-level ``models`` mapping selecting the
            encoder used to embed the sentences.
        sentences: Non-empty sequence of strings. Item id ``i`` in the returned
            index corresponds to ``sentences[i]``.
        n_trees: Number of trees passed to ``AnnoyIndex.build``. Defaults to
            10, the value previously hard-coded here.
        metric: Annoy metric name passed to the ``AnnoyIndex`` constructor.
            Defaults to ``"angular"``, the value previously hard-coded here.

    Returns:
        The built ``AnnoyIndex``.

    Raises:
        KeyError: If ``model_name`` is not present in ``models``.
        ValueError: If ``sentences`` is empty.
    """
    # Without at least one embedding there is no vector width to size the
    # index with; fail with a clear message instead of an opaque shape error.
    if len(sentences) == 0:
        raise ValueError("sentences must not be empty")

    encoder = models[model_name]
    vectors = encoder.encode(sentences)

    # The index dimensionality is taken from the embeddings themselves, so any
    # encoder output width works without configuration.
    index = AnnoyIndex(vectors.shape[1], metric)
    for item_id, vector in enumerate(vectors):
        index.add_item(item_id, vector)
    index.build(n_trees)
    return index
| |
|
| | |
# Build, for every model, one index per sentence column. The sentences1 index
# is built and stored before the sentences2 index for the same model.
for model_name in model_names:
    for _corpus, _store in (
        (sentences1, annoy_indexes1),
        (sentences2, annoy_indexes2),
    ):
        _store[model_name] = build_annoy_index(model_name, _corpus)
| |
|
def find_similar_sentence_annoy(sentence, model_name, sentence_list, annoy_index):
    """Return the entry of ``sentence_list`` nearest to ``sentence``.

    Args:
        sentence: Query text, embedded with the model named ``model_name``.
        model_name: Key used for both the module-level ``models`` mapping and
            the ``annoy_index`` mapping.
        sentence_list: Sequence the Annoy item ids refer back to.
        annoy_index: Despite the singular name, this is subscripted with
            ``model_name``, i.e. it is a mapping of model name -> index.

    Returns:
        The element of ``sentence_list`` at the position of the single nearest
        neighbour reported by the index.
    """
    query_vector = models[model_name].encode(sentence)
    index_for_model = annoy_index[model_name]
    # Only the top hit is requested, so the result list has one entry.
    hits = index_for_model.get_nns_by_vector(query_vector, 1)
    return sentence_list[hits[0]]
| |
|
def calculate_cosine_similarity(sentence1, sentence2, model):
    """Return the cosine similarity of two sentences as a plain number.

    Args:
        sentence1: First text to embed.
        sentence2: Second text to embed.
        model: Object whose ``encode`` method turns a sentence into an
            embedding accepted by ``util.cos_sim``.

    Returns:
        The scalar extracted via ``.item()`` from the ``util.cos_sim`` result.
    """
    first_vec, second_vec = model.encode(sentence1), model.encode(sentence2)
    score = util.cos_sim(first_vec, second_vec)
    return score.item()
| |
|
def compare_models_annoy(sentence, model1_name, model2_name, model3_name, model4_name):
    """Run the nearest-sentence lookup for each selected model.

    For every distinct selected model, the closest entry to ``sentence`` is
    looked up in ``sentences1`` and in ``sentences2``, and the cosine
    similarity between those two retrieved sentences is computed with the
    same model.

    Args:
        sentence: Query text entered by the user.
        model1_name, model2_name, model3_name, model4_name: Names of the
            models to compare; each must be a key of ``models``. The same name
            may be passed more than once.

    Returns:
        A tuple ``(sentence1_results, sentence2_results, similarities)`` of
        dicts keyed by model name, in first-selected order. Only the models
        that were actually selected appear as keys.

    Raises:
        KeyError: If a name is not a key of ``models`` / the index mappings.
    """
    # dict.fromkeys drops repeated selections while keeping first-seen order.
    # Iterating over the selection (rather than the global model list) is what
    # keeps a duplicated dropdown choice from raising KeyError below.
    selected = list(
        dict.fromkeys((model1_name, model2_name, model3_name, model4_name))
    )

    sentence1_results = {
        name: find_similar_sentence_annoy(sentence, name, sentences1, annoy_indexes1)
        for name in selected
    }
    sentence2_results = {
        name: find_similar_sentence_annoy(sentence, name, sentences2, annoy_indexes2)
        for name in selected
    }

    # Similarity is measured between the two retrieved sentences, not between
    # the query and either of them.
    similarities = {
        name: calculate_cosine_similarity(
            sentence1_results[name], sentence2_results[name], models[name]
        )
        for name in selected
    }

    return sentence1_results, sentence2_results, similarities
| |
|
def format_results(sentence1_results, sentence2_results, similarities):
    """Format per-model results as text for display in Gradio.

    Args:
        sentence1_results: Mapping of model name -> sentence retrieved from
            the first sentence list. Its keys and their order determine which
            models are rendered and in what order.
        sentence2_results: Mapping of model name -> sentence retrieved from
            the second sentence list; must contain every key of
            ``sentence1_results``.
        similarities: Mapping of model name -> numeric similarity; must
            contain every key of ``sentence1_results``.

    Returns:
        One four-line section per model (bold model name, both sentences, the
        similarity with four decimals), each followed by a blank line.
        An empty string when there are no results.

    Raises:
        KeyError: If ``sentence2_results`` or ``similarities`` lacks a model
            present in ``sentence1_results``.
    """
    # Render exactly the models that were computed instead of a global model
    # list, so a partial result set cannot trigger a KeyError here.
    sections = [
        f"**{model_name}**\n"
        f"Most Similar Sentence from sentence1: {sentence1_results[model_name]}\n"
        f"Most Similar Sentence from sentence2: {sentence2_results[model_name]}\n"
        f"Cosine Similarity: {similarities[model_name]:.4f}\n\n"
        for model_name in sentence1_results
    ]
    return "".join(sections)
| |
|
def gradio_interface(sentence, model1_name, model2_name, model3_name, model4_name):
    """Callback wired into the Gradio UI.

    Forwards the query sentence and the four selected model names to
    ``compare_models_annoy`` and returns its three result mappings rendered
    by ``format_results``.
    """
    chosen_models = (model1_name, model2_name, model3_name, model4_name)
    # compare_models_annoy returns (sentence1_results, sentence2_results,
    # similarities), which is exactly the positional order format_results takes.
    results = compare_models_annoy(sentence, *chosen_models)
    return format_results(*results)
| |
|
# User-facing description shown under the title (kept verbatim, in Italian).
_description = (
    "Inserisce una frase e confronta le frasi più simili generate da diversi modelli "
    "sentence-transformer (utilizzando Annoy per una ricerca più veloce) sia dalla frase1 "
    "che dalla frase2. Calcola anche la similarità del coseno tra le frasi. "
    "Utilizza sentence-transformers per l'italiano e lo split test del dataset stsb_multi_mt."
)

# One free-text box followed by four model selectors; selector number k
# defaults to the k-th entry of model_names, matching the five positional
# parameters of gradio_interface.
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Textbox(lines=2, placeholder="Enter your sentence here..."),
        *(
            gr.Dropdown(model_names, value=model_names[slot], label=f"Model {slot + 1}")
            for slot in range(4)
        ),
    ],
    outputs=gr.Markdown(),
    title="Sentence Transformer Model Comparison (Annoy)",
    description=_description,
)

iface.launch()
| |
|
| | except Exception as e: |
| | print(f"Error loading dataset: {e}") |
| | iface = gr.Interface( |
| | fn=lambda: "Dataset loading failed. Check console for details.", |
| | inputs=[], |
| | outputs=gr.Textbox(), |
| | title="Dataset Loading Error", |
| | description="There was an error loading the dataset.", |
| | ) |
| | iface.launch() |