BiLSTM‑CRF for Tatar Morphological Analysis
This model is a BiLSTM‑CRF trained on 80,000 sentences from the Tatar Morphological Corpus. It predicts fine‑grained morphological tags (e.g., N+Sg+Nom, V+PRES(Й)+3SG).
Performance on Test Set
| Metric | Value | 95% CI |
|---|---|---|
| Token Accuracy | 0.9440 | [0.9421, 0.9458] |
| Micro F1 | 0.9440 | [0.9420, 0.9459] |
| Macro F1 | 0.5330 | [0.5149, 0.5519] |
Accuracy by Part of Speech (Top 10)
| POS | Accuracy |
|---|---|
| PUNCT | 1.0000 |
| NOUN | 0.8913 |
| VERB | 0.8725 |
| ADJ | 0.9418 |
| PRON | 0.9900 |
| PART | 0.9982 |
| PROPN | 0.9248 |
| ADP | 1.0000 |
| CCONJ | 0.9992 |
| ADV | 0.9886 |
Usage
Install required packages:
pip install torch pytorch-crf transformers huggingface_hub

(Note: the `torchcrf` module used below is provided by the PyPI package `pytorch-crf`, not a package named `torchcrf`.)
Then load and use the model:
import torch
import json
from torch import nn
from torchcrf import CRF
from huggingface_hub import hf_hub_download
# Define the model class (must match training)
class BiLSTMCRF(nn.Module):
    """Bidirectional LSTM encoder topped with a CRF layer for sequence tagging.

    forward() returns the negative CRF log-likelihood (a scalar loss) when
    `labels` is given, otherwise the Viterbi-decoded tag-id sequences.
    """

    def __init__(self, vocab_size, emb_dim, hid_dim, num_tags, dropout=0.5):
        super().__init__()
        # padding_idx=0: the <PAD> row of the embedding stays zero.
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        # hid_dim // 2 per direction so the concatenated output is hid_dim wide.
        self.lstm = nn.LSTM(emb_dim, hid_dim // 2, bidirectional=True,
                            batch_first=True, dropout=dropout)
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(hid_dim, num_tags)
        self.crf = CRF(num_tags, batch_first=True)

    def forward(self, input_ids, mask, labels=None):
        enc, _ = self.lstm(self.embedding(input_ids))
        # Per-token emission scores over the tag set.
        emissions = self.classifier(self.dropout(enc))
        if labels is None:
            # Inference: Viterbi decode over the unmasked positions.
            return self.crf.decode(emissions, mask=mask.bool())
        # Training: -100 marks ignored positions; remap to a valid tag id
        # (those positions are masked out of the CRF likelihood anyway).
        safe_labels = labels.masked_fill(labels == -100, 0)
        return -self.crf(emissions, safe_labels, mask=mask.bool(), reduction='mean')
# Download required files from Hugging Face.
repo_id = "TatarNLPWorld/lstm-tatar-morph"
config_path = hf_hub_download(repo_id, "config.json")
word2id_path = hf_hub_download(repo_id, "word2id.json")
weights_path = hf_hub_download(repo_id, "best_model.pt")
id2tag_path = hf_hub_download(repo_id, "id2tag.json")

# Load hyperparameters and vocabularies.
with open(config_path, encoding="utf-8") as f:
    config = json.load(f)
with open(word2id_path, encoding="utf-8") as f:
    word2id = json.load(f)
with open(id2tag_path, encoding="utf-8") as f:
    # JSON object keys are always strings; restore the integer tag ids.
    id2tag = {int(k): v for k, v in json.load(f).items()}

# Instantiate the model and load the trained weights.
model = BiLSTMCRF(
    vocab_size=len(word2id),
    emb_dim=config['embedding_dim'],
    hid_dim=config['hidden_dim'],
    num_tags=config['num_labels'],
    dropout=config.get('dropout', 0.5)
)
# weights_only=True: only tensors are unpickled from the downloaded file
# (safe against malicious pickles). Strict loading (the default) makes any
# key mismatch fail loudly instead of silently leaving layers uninitialized.
state_dict = torch.load(weights_path, map_location='cpu', weights_only=True)
model.load_state_dict(state_dict)
model.eval()
def predict(tokens, max_len=128):
    """Return one morphological tag string per input token.

    Tokens beyond `max_len` are dropped, so the result can be shorter
    than `tokens`. An empty input yields an empty list (the CRF decoder
    requires at least one unmasked timestep, so we must not call it).
    """
    if not tokens:
        return []
    tokens = tokens[:max_len]
    # Map words to ids; unknown words fall back to the <UNK> entry
    # (assumes '<UNK>' is present in word2id — it is written at training time).
    ids = [word2id.get(w, word2id['<UNK>']) for w in tokens]
    n = len(ids)
    # Right-pad to max_len: id 0 is the padding index, mask 0 marks padding.
    ids += [0] * (max_len - n)
    mask = [1] * n + [0] * (max_len - n)
    input_ids = torch.tensor([ids], dtype=torch.long)
    mask_tensor = torch.tensor([mask], dtype=torch.long)
    with torch.no_grad():
        # Model returns a list of decoded tag-id sequences; batch size is 1.
        preds = model(input_ids, mask_tensor)[0]
    return [id2tag[p] for p in preds[:n]]
# Example: tag a short Tatar sentence and print token/tag pairs.
tokens = ["Татар", "теле", "бик", "бай", "."]
tags = predict(tokens)
for token, tag in zip(tokens, tags):
    print(token, tag, sep=" -> ")
Expected output:
Татар -> N+Sg+Nom
теле -> N+Sg+POSS_3(СЫ)+Nom
бик -> Adv
бай -> Adj
. -> PUNCT
Citation
If you use this model, please cite it as:
@misc{arabov-lstm-tatar-morph-2026,
title = {BiLSTM‑CRF for Tatar Morphological Analysis},
author = {Arabov Mullosharaf Kurbonovich},
year = {2026},
publisher = {Hugging Face},
url = {https://huggingface.co/TatarNLPWorld/lstm-tatar-morph}
}
License
Apache 2.0
- Downloads last month
- 30