Kullanıcı:Alperen/Arşiv kaynaklarını düzeltme betiği

Aşağıdaki betik, MediaWiki API yardımıyla kaynak kodu elde edilen bir sayfada, başlığı olmadan "Arşivlenmiş kopya" şeklinde bırakılmış referansları düzenleyerek bu referanslara başlık (title) ekler. Kodun çalışma süresi, sayfada başlıksız olarak (arşivlenmiş kopya başlığı ile) duran bağlantıların sayısı ile orantılıdır. Kullanım için Special:BotPasswords sayfasından alacağınız bot şifresi ile kodun içindeki USERNAME ve PASSWORD parametrelerini güncelleyiniz; üzerinde işlem yapacağınız sayfayı page_title parametresi ile belirtiniz.

Bilinen kısıtlamalar:

1. PDF linkleri ve bunların başlıkları üzerinde işlem yapılmamaktadır.
2. Şablon parametrelerinin url ile bitişik olduğu durumlarda (url'den sonra | geliyorsa) URL bilgisi hatalı çekilebilmektedir.
3. Bazı durumlarda "Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER." hatası görünebilmektedir. Çoğunlukla hatayı dikkate almadan beklerseniz işlem tamamlanmaktadır.
4. Özellikle Türkçe sayfalarda bulunan sorunlu başlıklar, karakter kodu hataları (Türkçe karakter kodlamasındaki hatalar) düzeltilmemektedir. Bunlar manuel olarak ele alınmalıdır.

import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import re

#!/usr/bin/python3

"""
    login.py

    MediaWiki API Demos
    Demo of `Login` module: Sending post request to login
    MIT license
"""

import requests

# Bot credentials: replace both placeholders with the values obtained from
# Special:BotPasswords before running the script.
USERNAME = "KULLANICI ADINIZI GİRİNİZİ"
PASSWORD = "Özel:BotPasswords sayfasından aldığınız bot şifrenizi giriniz"

# A single session is used for both login requests so that the cookies set
# while fetching the token are sent back with the login POST.
S = requests.Session()

# NOTE(review): the login is performed against mediawiki.org, while the page
# is later fetched from tr.wikipedia.org with a plain requests.get() —
# confirm that this is the intended endpoint.
URL = "https://www.mediawiki.org/w/api.php"

# Retrieve login token first
PARAMS_0 = {
    'action': "query",
    'meta': "tokens",
    'type': "login",
    'format': "json"
}

R = S.get(url=URL, params=PARAMS_0)
DATA = R.json()

try:
    LOGIN_TOKEN = DATA['query']['tokens']['logintoken']
except KeyError:
    # Fail with the API answer attached instead of an opaque KeyError.
    raise RuntimeError(f"No login token in API response: {DATA}") from None

print(LOGIN_TOKEN)

# Send a post request to login. Using the main account for login is not
# supported. Obtain credentials via Special:BotPasswords
# (https://www.mediawiki.org/wiki/Special:BotPasswords) for lgname & lgpassword

PARAMS_1 = {
    'action': "login",
    'lgname': USERNAME,
    'lgpassword': PASSWORD,
    'lgtoken': LOGIN_TOKEN,
    'format': "json"
}

R = S.post(URL, data=PARAMS_1)
DATA = R.json()

print(DATA)

# Explicit check instead of `assert`: assertions are removed when Python runs
# with -O, which would let the script continue after a failed login.
if DATA.get('login', {}).get('result') != 'Success':
    raise RuntimeError(f"Login failed: {DATA}")

api_url = 'https://tr.wikipedia.org/w/api.php'  # Adjust the URL as needed
page_title = 'DÜZENLEMEK İSTEDİĞNİZ SAYFANIN BAŞLIĞINI YAZINIZ'  # Replace with the title of the Wikipedia page you want

# Parameters of the `parse` API call: request the raw wikitext of the page
params = {
    'action': 'parse',
    'format': 'json',
    'page': page_title,
    'prop': 'wikitext',
}

# Send a GET request to the Wikipedia API
response = requests.get(api_url, params=params)

# Everything below needs `wikitext`; stop here with a clear message instead of
# continuing and failing later with a NameError on the unbound variable.
if response.status_code != 200:
    raise SystemExit("API request failed.")

data = response.json()

# The wikitext is expected under data['parse']['wikitext']['*']
if 'parse' not in data or 'wikitext' not in data['parse']:
    raise SystemExit("Wikitext not found in the API response.")

wikitext = data['parse']['wikitext']['*']
print("Wikitext Content:")
print(wikitext)


# Function to extract URLs from the wikitext
def extract_urls(wikitext):
    """Return all https://web.archive.org/ URLs found in *wikitext*.

    Matches are returned in order of appearance and duplicates are kept.
    A URL runs up to the next whitespace character, so wiki markup glued
    to the URL (for example ``|`` or ``}}``) is part of the returned string.
    """
    archive_link = re.compile(r'https://web\.archive\.org/\S+')
    return [hit.group(0) for hit in archive_link.finditer(wikitext)]


# Collect every archive URL that appears in the page source
page_urls = extract_urls(wikitext)

# Show the collected URLs, one per line, under a header line
print("List of URLs:", *page_urls, sep="\n")

# Maps each URL to the title fetched for it
url_titles = {}

# URLs that have already been requested, so each one is fetched only once
visited_links = set()

# Function to fetch the title of a URL
def _title_from_html(html_bytes):
    """Return the whitespace-normalised <title> text of *html_bytes*, or None."""
    soup = BeautifulSoup(html_bytes, 'html.parser')
    title_tag = soup.title
    if title_tag is None:
        return None
    # Collapse newlines and indentation inside the tag into single spaces.
    title = ' '.join(title_tag.get_text().split())
    return title or None


def fetch_title(url):
    """Fetch *url* and return the text of its HTML <title>.

    The body is streamed into one growing buffer and the buffer as a whole is
    parsed, so a title (or a multi-byte character) that straddles two network
    chunks is handled correctly. Reading stops as soon as a title has been
    found or once ``max_bytes`` have been received.

    Returns:
        The title string, ``'No Title'`` when the document has no usable
        title, or ``None`` when the request itself failed (the error is
        printed).
    """
    max_bytes = 1024 * 1024  # upper bound on how much of a document is read
    closing_tag = b'</title>'
    buffer = bytearray()

    try:
        # The context manager closes the streamed response on every exit path;
        # the timeout keeps one unresponsive host from blocking the whole run.
        with requests.get(url, stream=True, timeout=30) as response:
            response.raise_for_status()

            for chunk in response.iter_content(chunk_size=8192):
                # Re-scan a few bytes before the new chunk so that a closing
                # tag split across two chunks is still detected.
                scan_from = max(0, len(buffer) - len(closing_tag))
                buffer.extend(chunk)

                if closing_tag in buffer[scan_from:].lower():
                    title = _title_from_html(bytes(buffer))
                    if title:
                        return title

                if len(buffer) >= max_bytes:
                    break

    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")
        return None

    # No complete title was seen while streaming: parse whatever was received.
    return _title_from_html(bytes(buffer)) or 'No Title'


# Function to visit a URL, fetch its title, and print the URL with its title
def visit_url_and_fetch_title(url):
    """Fetch the title of *url* once and record it in ``url_titles``.

    A URL already present in ``visited_links`` is skipped. Otherwise it is
    marked as visited, and the result of ``fetch_title`` is stored only when
    it is truthy.
    """
    if url not in visited_links:
        visited_links.add(url)

        fetched = fetch_title(url)
        if fetched:
            url_titles[url] = fetched

# Report how many archive links were collected (repeated URLs are counted too)
num_links = len(page_urls)
print(f"Number of links in page_urls: {num_links}")

# Fetch a title for every collected URL; the helper skips URLs it has seen
for link in page_urls:
    visit_url_and_fetch_title(link)

# Show each URL together with the title fetched for it
print("\nFinal Result:")
for link, link_title in url_titles.items():
    print(f"URL: {link}\nTitle: {link_title}\n")

# Replace every '|' in the titles with '&#124;'; the titles are later inserted
# into text where '|' separates the fields
url_titles.update({
    link: link_title.replace('|', '&#124;')
    for link, link_title in url_titles.items()
})

# Function to find and replace the specified pattern
def replace_pattern(match):
    """Build the replacement text for one ``başlık=…|…|arşivurl=…`` match.

    Args:
        match: A regex match whose group 1 is the current title, group 2 the
            parameter between the title and the archive URL, and group 3 the
            archive URL.

    Returns:
        The matched text with the placeholder title swapped for the title
        stored in ``url_titles``. The match is returned unchanged when the
        current title is not the "Arşivlenmiş kopya" placeholder, or when no
        real title is known for the URL, so existing titles are never
        overwritten and "No Title" is never written into the page.
    """
    placeholder = 'Arşivlenmiş kopya'

    # The URL is the lookup key into the fetched titles
    url_matched = match.group(3)
    title = url_titles.get(url_matched)

    # Leave the reference untouched unless there is a placeholder to replace
    # and an actual title to replace it with.
    if match.group(1).strip() != placeholder or not title or title == 'No Title':
        return match.group(0)

    # Keep the middle parameter (minus any stray placeholder text) and
    # re-attach the archive URL after it
    updated_text = match.group(2).replace(placeholder, '') + '|' + 'arşivurl=' + url_matched

    return f'başlık={title}|{updated_text}'

# Regex for "başlık=<title>|<one parameter>|arşivurl=<archive URL>".
# The dots of the host name are escaped so that they only match literal dots.
pattern = r'başlık=([^|]+)\|([^|]+)\|arşivurl=(https://web\.archive\.org/\S+)'

# Rewrite every match through replace_pattern; re.subn also returns how many
# matches were processed
wikitext, replacement_count = re.subn(pattern, replace_pattern, wikitext)

# Display the updated wikitext
print("Updated Wikitext:")
print(wikitext)

# Display the number of replacements made
print("Number of Replacements:", replacement_count)