import os
from datetime import datetime
from typing import List

import spacy
import torch
from transformers import AutoTokenizer, BertForTokenClassification

def split_text_recursively(text):
    # Recursively split text into lines; equivalent to text.split('\n').
    # Note: recursion depth grows with the number of lines, so very long
    # documents can hit Python's recursion limit.
    if '\n' not in text:
        return [text]
    parts = text.split('\n', 1)
    return [parts[0]] + split_text_recursively(parts[1])

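# Example: split_text_recursively("a\nb\nc") -> ["a", "b", "c"]
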
def parse_post(path):
    """Read a job posting from disk and return its sentences."""

    # Loading the spaCy pipeline on every call is slow; consider hoisting
    # this to module level if parse_post is called in a loop.
    nlp = spacy.load("en_core_web_sm")

    with open(path, 'r') as file:
        text = file.read()

    # Split into non-empty, stripped lines.
    str_list = split_text_recursively(text)
    str_list = [i.strip() for i in str_list]
    str_list = list(filter(None, str_list))

    # Run spaCy's sentence segmenter over each line.
    sents = []
    for line in str_list:
        doc = nlp(line)
        for sent in doc.sents:
            sents.append(sent.text)

    return sents

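# Example (illustrative path): parse_post("job-postings/01-31-2025/posting.txt")
# returns a list of sentence strings.
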
# Model and tokenizer for knowledge/skill token classification, loaded from
# the Hugging Face Hub.
tokenizer = AutoTokenizer.from_pretrained("jjzha/jobbert_knowledge_extraction")
model = BertForTokenClassification.from_pretrained("Robzy/jobbert_knowledge_extraction")

# Label maps from the model config (BIO-style tags).
id2label = model.config.id2label
label2id = model.config.label2id

def pad(list_of_lists, pad_value=0):
    """Right-pad each list to the batch max length and build attention masks."""
    max_len = max(len(lst) for lst in list_of_lists)

    padded_lists = [lst + [pad_value] * (max_len - len(lst)) for lst in list_of_lists]
    attention_masks = [[1] * len(lst) + [0] * (max_len - len(lst)) for lst in list_of_lists]

    return torch.tensor(padded_lists), torch.tensor(attention_masks)

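# Example:
#   pad([[5, 6], [7]]) -> (tensor([[5, 6], [7, 0]]), tensor([[1, 1], [1, 0]]))
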
def collate_fn(batch: List[dict]):
    """Collate pre-tokenized examples (dicts with 'tokens' and 'tags_knowledge'
    keys) into padded id/label tensors."""
    input_ids, attention_mask = pad([tokenizer.convert_tokens_to_ids(x['tokens']) for x in batch])
    tags_knowledge, _ = pad([[label2id[t] for t in b['tags_knowledge']] for b in batch])
    return {"input_ids": input_ids, "tags_knowledge": tags_knowledge, "attention_mask": attention_mask}

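# Minimal sketch of using collate_fn for fine-tuning (`train_dataset` is
# hypothetical: any sequence of dicts with 'tokens' and 'tags_knowledge'):
#
#   from torch.utils.data import DataLoader
#   loader = DataLoader(train_dataset, batch_size=16, collate_fn=collate_fn)
#   for batch in loader:
#       out = model(input_ids=batch["input_ids"],
#                   attention_mask=batch["attention_mask"],
#                   labels=batch["tags_knowledge"])
#       out.loss.backward()
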
def extract_spans(B_mask, I_mask, token_ids, tokenizer):
    """
    Extract text spans for 2D tensors (batch of sequences).

    B_mask/I_mask mark tokens predicted as the beginning/inside of a span.
    """
    batch_size = B_mask.size(0)
    all_spans = []

    for batch_idx in range(batch_size):
        spans = []
        current_span = []

        for i in range(B_mask.size(1)):
            if B_mask[batch_idx, i].item() == 1:
                # A B tag closes any open span and starts a new one.
                if current_span:
                    spans.append(current_span)
                current_span = [token_ids[batch_idx, i].item()]
            elif I_mask[batch_idx, i].item() == 1 and current_span:
                # An I tag extends the open span.
                current_span.append(token_ids[batch_idx, i].item())
            else:
                # An O tag (or I without a preceding B) closes the open span.
                if current_span:
                    spans.append(current_span)
                    current_span = []

        if current_span:
            spans.append(current_span)

        decoded_spans = [tokenizer.decode(span, skip_special_tokens=True) for span in spans]
        all_spans.append(decoded_spans)

    # Drop sequences that produced no spans.
    all_spans = list(filter(lambda x: x != [], all_spans))

    return all_spans

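# Worked example (illustrative ids): with a B_mask row [1, 0, 1, 0], an
# I_mask row [0, 1, 0, 0], and token_ids row [10, 11, 12, 13], the loop
# collects spans [10, 11] (B then I) and [12] (B closed by the final O),
# which are then decoded to strings.
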
def concat_subtokens(tokens):
    """Merge WordPiece continuation tokens ('##...') into the preceding token.

    Unused helper; merge_spans below inlines the same logic.
    """
    result = []
    for token in tokens:
        if token.startswith('##'):
            # Continuation piece: glue onto the previous token.
            result[-1] += token[2:]
        else:
            result.append(token)
    return result

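# Example: concat_subtokens(["deep", "learn", "##ing"]) -> ["deep", "learning"]
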
def merge_spans(batch_spans):
    """Flatten per-sentence span lists, gluing '##' continuation pieces onto
    the previous span."""
    batch_decoded_spans = []

    for spans in batch_spans:

        # Skip a sentence whose first span is a dangling continuation piece.
        if not spans or spans[0].startswith('##'):
            continue

        decoded_spans = []
        for token in spans:
            if token.startswith('##'):
                # Continuation piece: glue onto the previous span.
                decoded_spans[-1] += token[2:]
            else:
                decoded_spans.append(token)

        batch_decoded_spans.extend(decoded_spans)

    return batch_decoded_spans

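# Examples of the merging behaviour:
#   merge_spans([["tensor", "##flow"], ["sql"]]) -> ["tensorflow", "sql"]
#   merge_spans([["##ing", "sql"]])              -> []  (leading '##' span
#                                                        skips the sentence)
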
def extract_skills(batch_sentences: List[str]):
    """Run the token-classification model over sentences and return skill spans."""
    print('Extracting skills from job posting...')

    # Tokenize with padding so the batch can be stacked into tensors.
    batch = tokenizer(batch_sentences, padding=True, truncation=True)
    batch_tokens = torch.tensor(batch['input_ids'])
    batch_attention_masks = torch.tensor(batch['attention_mask'])

    model.eval()
    with torch.no_grad():
        output = model(input_ids=batch_tokens, attention_mask=batch_attention_masks)

    # Greedy decoding; mask padding positions with -100 so they match no tag.
    pred = output.logits.argmax(-1)
    pred = torch.where(batch_attention_masks == 0, torch.tensor(-100), pred)

    # Assumes label ids 0 and 1 are the B and I tags (see label2id above).
    b_mask = torch.where(pred == 0, 1, 0)
    i_mask = torch.where(pred == 1, 1, 0)

    spans = extract_spans(b_mask, i_mask, batch_tokens, tokenizer)
    decoded_spans = merge_spans(spans)

    return decoded_spans

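# Example usage (illustrative output; actual spans depend on the model):
#   extract_skills(["Experience with Python and Kubernetes required."])
#   -> e.g. ["python", "kubernetes"]
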
def skills_save(path, skills):
    """Write skills to a file, one per line, without a trailing newline."""
    with open(path, 'w') as f:
        f.write("\n".join(skills))

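# Directory layout assumed by the functions below (relative to the current
# working directory):
#   job-postings/<MM-DD-YYYY>/<file>   input postings, one per file
#   tags/<MM-DD-YYYY>/<file>           output skills, one per line
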
def backfill():
    """Tag every posting under job-postings/, mirroring the tree under tags/."""
    job_dir = os.path.join(os.getcwd(), 'job-postings')
    tag_dir = os.path.join(os.getcwd(), 'tags')

    for date in os.listdir(job_dir):
        print(f"Processing date directory: {date}")

        # Named *_dir to avoid shadowing the tag_date() function below.
        job_date_dir = os.path.join(job_dir, date)
        tag_date_dir = os.path.join(tag_dir, date)

        if not os.path.exists(tag_date_dir):
            os.makedirs(tag_date_dir)
            print(f"Created directory: {tag_date_dir}")

        for job in os.listdir(job_date_dir):
            job_path = os.path.join(job_date_dir, job)
            tag_path = os.path.join(tag_date_dir, job)

            print(f"Processing job file: {job_path}")

            sents = parse_post(job_path)
            skills = extract_skills(sents)
            skills_save(tag_path, skills)

            print(f"Saved skills to: {tag_path}")

def tag_date(date):
    """Tag all postings for a single date directory (e.g. '01-31-2025')."""
    tag_dir = os.path.join(os.getcwd(), 'tags', date)
    job_dir = os.path.join(os.getcwd(), 'job-postings', date)

    if not os.path.exists(tag_dir):
        os.makedirs(tag_dir)
        print(f"Created directory: {tag_dir}")

    for job in os.listdir(job_dir):
        job_path = os.path.join(job_dir, job)
        tag_path = os.path.join(tag_dir, job)

        print(f"Processing job file: {job_path}")

        sents = parse_post(job_path)
        skills = extract_skills(sents)
        skills_save(tag_path, skills)

        print(f"Saved skills to: {tag_path}")

if __name__ == '__main__':

    # Tag today's postings; call backfill() instead to process every date
    # directory under job-postings/.
    date = datetime.today().strftime('%m-%d-%Y')
    tag_date(date)