Token Classification · Tatar
Tags: tatar, morphology, lstm, crf

BiLSTM‑CRF for Tatar Morphological Analysis

This model is a BiLSTM‑CRF trained on 80,000 sentences from the Tatar Morphological Corpus. It predicts fine‑grained morphological tags (e.g., N+Sg+Nom, V+PRES(Й)+3SG).
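The tags concatenate a coarse POS with inflectional features using `+` as a separator. As an illustration of this convention (the helper below is hypothetical, not part of the model's code), a tag can be split into its POS and feature components:

```python
def parse_tag(tag):
    """Split a morphological tag like 'N+Sg+Nom' into (POS, features).

    Assumes the first '+'-separated component is the POS and the rest
    are feature markers, per the tag format shown above.
    """
    parts = tag.split("+")
    return parts[0], parts[1:]

pos, feats = parse_tag("N+Sg+Nom")
# pos == "N", feats == ["Sg", "Nom"]
```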

Performance on Test Set

| Metric         | Value  | 95% CI           |
|----------------|--------|------------------|
| Token Accuracy | 0.9440 | [0.9421, 0.9458] |
| Micro F1       | 0.9440 | [0.9420, 0.9459] |
| Macro F1       | 0.5330 | [0.5149, 0.5519] |
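Intervals like those above are typically obtained by bootstrap resampling. As an illustration only (not necessarily the exact procedure used for this model), a percentile bootstrap over token-level correctness looks like this:

```python
import numpy as np

def bootstrap_ci(correct, n_boot=1000, alpha=0.05, seed=0):
    """Percentile bootstrap CI for accuracy over a 0/1 correctness array."""
    rng = np.random.default_rng(seed)
    correct = np.asarray(correct)
    # Resample with replacement and record the accuracy of each resample
    stats = [rng.choice(correct, size=len(correct), replace=True).mean()
             for _ in range(n_boot)]
    lo, hi = np.quantile(stats, [alpha / 2, 1 - alpha / 2])
    return float(lo), float(hi)

# e.g. bootstrap_ci(per_token_correct) where per_token_correct is a
# list of 1s (tag predicted correctly) and 0s over the test set
```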

Accuracy by Part of Speech (Top 10)

| POS   | Accuracy |
|-------|----------|
| PUNCT | 1.0000   |
| NOUN  | 0.8913   |
| VERB  | 0.8725   |
| ADJ   | 0.9418   |
| PRON  | 0.9900   |
| PART  | 0.9982   |
| PROPN | 0.9248   |
| ADP   | 1.0000   |
| CCONJ | 0.9992   |
| ADV   | 0.9886   |
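A breakdown like the one above can be computed by grouping token-level correctness by POS. A minimal sketch (a hypothetical helper; it takes the POS as the first `+`-component of the gold tag, whereas the table's UD-style labels such as NOUN may come from a separate mapping):

```python
from collections import defaultdict

def accuracy_by_pos(gold_tags, pred_tags):
    """Group exact-match tag accuracy by the coarse POS of the gold tag."""
    totals = defaultdict(int)
    hits = defaultdict(int)
    for gold, pred in zip(gold_tags, pred_tags):
        pos = gold.split("+")[0]  # assumes POS is the first tag component
        totals[pos] += 1
        hits[pos] += int(gold == pred)
    return {pos: hits[pos] / totals[pos] for pos in totals}
```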

Usage

Install the required packages (the CRF layer is imported as torchcrf but published on PyPI as pytorch-crf; transformers is not needed by the code below):

pip install torch pytorch-crf huggingface_hub

Then load and use the model:

import torch
import json
from torch import nn
from torchcrf import CRF
from huggingface_hub import hf_hub_download

# Define the model class (must match training)
class BiLSTMCRF(nn.Module):
    def __init__(self, vocab_size, emb_dim, hid_dim, num_tags, dropout=0.5):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        # Note: the dropout argument is a no-op for a single-layer LSTM (PyTorch warns about this)
        self.lstm = nn.LSTM(emb_dim, hid_dim // 2, bidirectional=True, batch_first=True, dropout=dropout)
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(hid_dim, num_tags)
        self.crf = CRF(num_tags, batch_first=True)

    def forward(self, input_ids, mask, labels=None):
        embeds = self.embedding(input_ids)
        lstm_out, _ = self.lstm(embeds)
        lstm_out = self.dropout(lstm_out)
        emissions = self.classifier(lstm_out)
        if labels is not None:
            mask = mask.bool()
            labels = torch.where(labels == -100, torch.tensor(0, device=labels.device), labels)
            return -self.crf(emissions, labels, mask=mask, reduction='mean')
        else:
            return self.crf.decode(emissions, mask=mask.bool())

# Download required files from Hugging Face
repo_id = "TatarNLPWorld/lstm-tatar-morph"
config_path = hf_hub_download(repo_id, "config.json")
word2id_path = hf_hub_download(repo_id, "word2id.json")
weights_path = hf_hub_download(repo_id, "best_model.pt")
id2tag_path = hf_hub_download(repo_id, "id2tag.json")

# Load hyperparameters
with open(config_path) as f:
    config = json.load(f)

with open(word2id_path) as f:
    word2id = json.load(f)

with open(id2tag_path) as f:
    id2tag = {int(k): v for k, v in json.load(f).items()}

# Instantiate model and load weights
model = BiLSTMCRF(
    vocab_size=len(word2id),
    emb_dim=config['embedding_dim'],
    hid_dim=config['hidden_dim'],
    num_tags=config['num_labels'],
    dropout=config.get('dropout', 0.5)
)
# strict=False silently skips mismatched keys; drop it to surface loading errors
model.load_state_dict(torch.load(weights_path, map_location='cpu'), strict=False)
model.eval()

def predict(tokens, max_len=128):
    ids = [word2id.get(w, word2id['<UNK>']) for w in tokens]
    mask = [1] * len(ids)
    orig_len = len(ids)
    
    if len(ids) > max_len:
        ids = ids[:max_len]
        mask = mask[:max_len]
        tokens = tokens[:max_len]
    else:
        ids += [0] * (max_len - len(ids))
        mask += [0] * (max_len - len(mask))
    
    input_ids = torch.tensor([ids], dtype=torch.long)
    mask_tensor = torch.tensor([mask], dtype=torch.long)
    
    with torch.no_grad():
        preds = model(input_ids, mask_tensor)[0]
    
    preds = preds[:orig_len]
    return [id2tag[p] for p in preds]

# Example
tokens = ["Татар", "теле", "бик", "бай", "."]
tags = predict(tokens)
for token, tag in zip(tokens, tags):
    print(f"{token} -> {tag}")

Expected output:

Татар -> N+Sg+Nom
теле -> N+Sg+POSS_3(СЫ)+Nom
бик -> Adv
бай -> Adj
. -> PUNCT
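Note that predict expects pre-tokenized input. For raw text, a simple regex tokenizer can serve as a starting point (a rough sketch; the Tatar Morphological Corpus's actual tokenization rules may differ):

```python
import re

def simple_tokenize(text):
    """Split on word characters vs. punctuation.

    A rough approximation for demonstration; clitics, hyphenation,
    and corpus-specific conventions are not handled.
    """
    return re.findall(r"\w+|[^\w\s]", text)

simple_tokenize("Татар теле бик бай.")
# -> ["Татар", "теле", "бик", "бай", "."]
```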

Citation

If you use this model, please cite it as:

@misc{arabov-lstm-tatar-morph-2026,
  title = {BiLSTM‑CRF for Tatar Morphological Analysis},
  author = {Arabov Mullosharaf Kurbonovich},
  year = {2026},
  publisher = {Hugging Face},
  url = {https://huggingface.co/TatarNLPWorld/lstm-tatar-morph}
}

License

Apache 2.0
