
[Python] NLP - Keyword Extraction Algorithm Accuracy

Discussion in 'Python' started by Stack, November 5, 2024 at 13:22.

  1. Stack (Participating Member)

    I've been researching NLP libraries like RAKE, KeyBERT, spaCy, etc. The task I have is simple keyword extraction, which models like RAKE and KeyBERT handle with no problems. But I've seen products like NeuronWriter and SurferSEO that seem to use significantly more complicated models. What are they built upon, and how are they so accurate across so many languages? None of the models I've encountered come close to the relevance that the algorithms of SurferSEO and NeuronWriter provide.
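
    For reference, the kind of basic extraction that RAKE or KeyBERT handles out of the box looks roughly like the sketch below (a minimal KeyBERT example; the multilingual model name and the sample sentence are placeholder assumptions, not part of my actual pipeline):

    from keybert import KeyBERT

    # Multilingual embedding model (placeholder choice; any sentence-transformers model can be swapped in)
    kw_model = KeyBERT(model="paraphrase-multilingual-MiniLM-L12-v2")

    # Sample Bulgarian sentence standing in for real article text
    doc = "Лазерната епилация е процедура за трайно премахване на нежелано окосмяване."

    keywords = kw_model.extract_keywords(
        doc,
        keyphrase_ngram_range=(1, 3),  # single words up to three-word phrases
        stop_words=None,               # no built-in Bulgarian stopword list
        top_n=10,
    )
    print(keywords)  # list of (phrase, similarity score) tuples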

    Here's the code that I am working with:

    import networkx as nx
    import re
    import json
    from collections import Counter
    from goose3 import Goose
    from itertools import combinations
    from math import ceil

    BULGARIAN_STOPWORDS = ["а","автентичен",.....]

    def clean_text(text):
        # Lowercase, strip characters outside word/whitespace/Cyrillic ranges, collapse whitespace
        text = text.lower()
        text = re.sub(r'[^\w\s\u0400-\u04FF]', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def is_cyrillic(word):
        # True if the word contains at least one Cyrillic character
        return bool(re.search(r'[\u0400-\u04FF]', word))

    def get_words(text):
        # Keep Cyrillic words longer than two characters that are not stopwords
        words = text.split()
        return [word for word in words if is_cyrillic(word) and len(word) > 2 and word not in BULGARIAN_STOPWORDS]

    def get_phrases(words, max_words=3):
        # Build overlapping 1- to max_words-word phrases from the filtered word list
        phrases = []
        for i in range(len(words)):
            for n in range(1, max_words + 1):
                if i + n <= len(words):
                    phrase = " ".join(words[i:i + n])
                    if phrase and all(is_cyrillic(w) for w in phrase.split()):
                        phrases.append(phrase)
        return phrases

    def build_graph(phrases, window_size=4):
        # Co-occurrence graph: phrases that appear within the same sliding window get weighted edges
        graph = nx.Graph()

        for phrase in phrases:
            if not graph.has_node(phrase):
                graph.add_node(phrase)

        for i in range(len(phrases) - window_size + 1):
            window = phrases[i:i + window_size]
            for phrase1, phrase2 in combinations(window, 2):
                if phrase1 != phrase2:
                    if graph.has_edge(phrase1, phrase2):
                        graph[phrase1][phrase2]['weight'] += 1
                    else:
                        graph.add_edge(phrase1, phrase2, weight=1)
        return graph

    def extract_keywords_from_urls(urls, top_n=10):
        # Fetch and clean the article text from each URL
        g = Goose({'language': 'bg'})
        all_texts = []
        keyword_occurrences = Counter()

        for url in urls:
            try:
                article = g.extract(url=url)
                title = article.title
                text = article.cleaned_text
                full_text = clean_text(title + " " + text)
                all_texts.append(full_text)
            except Exception as e:
                print(f"Error processing {url}: {str(e)}")
                continue

        g.close()

        combined_phrases = []
        print(all_texts)
        for text in all_texts:
            words = get_words(text)
            if not words:
                continue
            phrases = get_phrases(words, max_words=3)
            combined_phrases.extend(phrases)

        if not combined_phrases:
            return []

        graph = build_graph(combined_phrases)

        if len(graph.nodes()) == 0:
            return []

        # Rank phrases with PageRank over the co-occurrence graph and keep the top 1- to 3-word phrases
        scores = nx.pagerank(graph)
        keywords = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        keywords = [phrase for phrase, score in keywords if 1 <= len(phrase.split()) <= 3][:top_n]

        # Count keyword occurrences in all URLs and calculate the average
        keyword_data = []
        for keyword in keywords:
            total_count = sum(text.count(keyword) for text in all_texts)
            avg_count = total_count / len(all_texts)
            keyword_data.append({
                "keyword": keyword,
                "avg": max(1, ceil(avg_count))
            })

        return keyword_data

    def get_keywords(urls, top_n=50):
        try:
            keywords = extract_keywords_from_urls(urls, top_n)
            return keywords
        except Exception as e:
            print(f"Error extracting keywords: {str(e)}")
            return []

    if __name__ == "__main__":
        urls = [
            "https://dermavita.bg/protzedura/lazerna-epilatziya/",
            "https://incanto.center/service/lazerna-epilaciq-inkanto/",
            "https://derma-act.bg/p/lazerna-epilatsia/",
        ]

        keywords = get_keywords(urls)
        print(json.dumps(keywords, ensure_ascii=False, indent=4))

