
[Python] NLP - Keyword Extraction Algorithm Accuracy

Discussion in 'Python' started by Stack, November 5, 2024 at 13:22.

  1. Stack (Participating Member)

    I've been researching NLP libraries like RAKE, KeyBERT, spaCy, etc. The task I have is simple keyword extraction, which models like RAKE and KeyBERT handle with no problems. But I've seen products like NeuronWriter and SurferSEO that seem to use significantly more complicated models. What are they built upon, and how are they so accurate across so many languages? None of the models I've encountered come close to the relevance that the algorithms of SurferSEO and NeuronWriter provide.
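
    For reference, the kind of basic extraction that RAKE or KeyBERT handles out of the box looks roughly like the sketch below (a minimal KeyBERT example; the multilingual model name and the sample sentence are placeholder assumptions, not part of my actual pipeline):

    from keybert import KeyBERT

    # Multilingual embedding model (placeholder choice; any sentence-transformers model can be swapped in)
    kw_model = KeyBERT(model="paraphrase-multilingual-MiniLM-L12-v2")

    # Sample Bulgarian sentence standing in for real article text
    doc = "Лазерната епилация е процедура за трайно премахване на нежелано окосмяване."

    keywords = kw_model.extract_keywords(
        doc,
        keyphrase_ngram_range=(1, 3),  # single words up to three-word phrases
        stop_words=None,               # no built-in Bulgarian stopword list
        top_n=10,
    )
    print(keywords)  # list of (phrase, similarity score) tuples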

    Here's the code that I am working with:

    import networkx as nx
    import re
    import json
    from collections import Counter
    from goose3 import Goose
    from itertools import combinations
    from math import ceil

    BULGARIAN_STOPWORDS = ["а","автентичен",.....]

    def clean_text(text):
        # Lowercase, strip characters outside word/whitespace/Cyrillic ranges, collapse whitespace
        text = text.lower()
        text = re.sub(r'[^\w\s\u0400-\u04FF]', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def is_cyrillic(word):
        # True if the word contains at least one Cyrillic character
        return bool(re.search(r'[\u0400-\u04FF]', word))

    def get_words(text):
        # Keep Cyrillic words longer than two characters that are not stopwords
        words = text.split()
        return [word for word in words if is_cyrillic(word) and len(word) > 2 and word not in BULGARIAN_STOPWORDS]

    def get_phrases(words, max_words=3):
        # Build overlapping 1- to max_words-word phrases from the filtered word list
        phrases = []
        for i in range(len(words)):
            for n in range(1, max_words + 1):
                if i + n <= len(words):
                    phrase = " ".join(words[i:i + n])
                    if phrase and all(is_cyrillic(w) for w in phrase.split()):
                        phrases.append(phrase)
        return phrases

    def build_graph(phrases, window_size=4):
        # Co-occurrence graph: phrases that appear within the same sliding window get weighted edges
        graph = nx.Graph()

        for phrase in phrases:
            if not graph.has_node(phrase):
                graph.add_node(phrase)

        for i in range(len(phrases) - window_size + 1):
            window = phrases[i:i + window_size]
            for phrase1, phrase2 in combinations(window, 2):
                if phrase1 != phrase2:
                    if graph.has_edge(phrase1, phrase2):
                        graph[phrase1][phrase2]['weight'] += 1
                    else:
                        graph.add_edge(phrase1, phrase2, weight=1)
        return graph

    def extract_keywords_from_urls(urls, top_n=10):
        # Fetch and clean the article text from each URL
        g = Goose({'language': 'bg'})
        all_texts = []
        keyword_occurrences = Counter()

        for url in urls:
            try:
                article = g.extract(url=url)
                title = article.title
                text = article.cleaned_text
                full_text = clean_text(title + " " + text)
                all_texts.append(full_text)
            except Exception as e:
                print(f"Error processing {url}: {str(e)}")
                continue

        g.close()

        combined_phrases = []
        print(all_texts)
        for text in all_texts:
            words = get_words(text)
            if not words:
                continue
            phrases = get_phrases(words, max_words=3)
            combined_phrases.extend(phrases)

        if not combined_phrases:
            return []

        graph = build_graph(combined_phrases)

        if len(graph.nodes()) == 0:
            return []

        # Rank phrases with PageRank over the co-occurrence graph and keep the top 1- to 3-word phrases
        scores = nx.pagerank(graph)
        keywords = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        keywords = [phrase for phrase, score in keywords if 1 <= len(phrase.split()) <= 3][:top_n]

        # Count keyword occurrences in all URLs and calculate the average
        keyword_data = []
        for keyword in keywords:
            total_count = sum(text.count(keyword) for text in all_texts)
            avg_count = total_count / len(all_texts)
            keyword_data.append({
                "keyword": keyword,
                "avg": max(1, ceil(avg_count))
            })

        return keyword_data

    def get_keywords(urls, top_n=50):
        try:
            keywords = extract_keywords_from_urls(urls, top_n)
            return keywords
        except Exception as e:
            print(f"Error extracting keywords: {str(e)}")
            return []

    if __name__ == "__main__":
        urls = [
            "https://dermavita.bg/protzedura/lazerna-epilatziya/",
            "https://incanto.center/service/lazerna-epilaciq-inkanto/",
            "https://derma-act.bg/p/lazerna-epilatsia/",
        ]

        keywords = get_keywords(urls)
        print(json.dumps(keywords, ensure_ascii=False, indent=4))

