사용자:Klutzy:bot/script

위키백과 문서를 자동으로 관리하기 위한 스크립트 모음입니다.
pywikipedia를 기반으로 동작합니다. 각 스크립트를 pywikipedia 폴더 안에 .py 파일로 저장한 뒤 실행해 주세요.
파이썬 2.6 이상의 2.x 버전에서 실행하기를 권장합니다. 2.4 이하에서는 문법 오류가 날 가능성이 큽니다.
모든 코드는 저작권이 없는 퍼블릭 도메인으로 배포합니다.
물론 코드가 잘못되어도 책임은 없습니다. --;;
버그가 있으면 사용자토론:Klutzy에 알려 주세요. 직접 고쳐 주셔도 좋습니다. 감사합니다!

자동목록

# -*- coding: utf-8  -*-
import random
import wikipedia, catlib

def list_shuffle(list, N):
    """Pick at most ``N`` items from ``list`` at random.

    A sequence holding ``N`` items or fewer is handed back untouched (the
    very same object, in its original order). Otherwise ``random.sample``
    draws ``N`` items without replacement and a new list is returned.
    """
    needs_sampling = len(list) > N
    return random.sample(list, N) if needs_sampling else list

def cat_list_rec(top_cat, ignore_cats=None):
    """Return article titles found in the subcategories of ``top_cat``.

    The loops below iterate ``cat.subcategories()`` once and call
    ``articles()`` on each result; this function itself does not descend any
    further, and articles placed directly in ``top_cat`` are not collected.

    top_cat     -- title of the parent category.
    ignore_cats -- subcategory titles to skip entirely (default: none).

    Titles containing ":" are dropped. A page that appears in several
    subcategories is reported only once, at its first occurrence, so that a
    random sample of the result cannot list the same article twice.
    """
    if ignore_cats is None:
        ignore_cats = []

    articlelist = []  # assume that top_cat has no article
    seen = set()      # titles already collected, for de-duplication
    cat = catlib.Category(wikipedia.getSite(), top_cat)
    for subcat in cat.subcategories():
        if subcat.title() in ignore_cats:
            continue
        for page in subcat.articles():
            title = page.title()
            if ":" in title or title in seen:
                continue
            seen.add(title)
            articlelist.append(title)
    return articlelist

def cat_list(top_cat):
    """Return the titles of the articles in category ``top_cat``.

    Any article whose title contains ":" is left out.
    """
    category = catlib.Category(wikipedia.getSite(), top_cat)
    titles = []
    for article in category.articles():
        if ":" in article.title():
            continue
        titles.append(article.title())
    return titles

def main():
    count = 10

    stub_ignore_cats = [
        u'분류:숫자에 관한 토막글',
        #u'분류:대학교에 관한 토막글',
        #u'분류:학교에 관한 토막글',
        u'분류:토막글 분류가 잘못된 문서',
    ]
    
    other_cats = [
        (u'정리 필요', u'분류:전체에 정리가 필요한 글'),
        (u'POV', u'분류:중립성에 이의가 제기된 문서'),
        (u'합병', u'분류:합쳐야 할 문서'),
        (u'위키화', u'분류:위키화가 필요한 문서'),
        (u'외톨이', u'분류:모든 외톨이 글'),
        (u'저명성', u'분류:저명성의 입증이 요구되는 문서'),
    ]
    
    stub_list = cat_list_rec(u'분류:토막글 분류', stub_ignore_cats)
    
    data = []
    for title, catname in other_cats:
        data.append((title, catname, cat_list(catname)))
    
    result = u""
    result += u"<!-- 이 문서의 목록은 봇을 통해 자동으로 변경됩니다. -->\n"
    result += u"""* '''[[위키백과:토막글|토막글]]''': """
    result += u", ".join([u"[[%s]]"%i for i in list_shuffle(stub_list, count)]) + u"\n"

    for title, catname, lis in data:
        result += u"""* '''[[:%s|%s]]''': """%(catname, title)
        result += u", ".join([u"[[%s]]"%i for i in list_shuffle(lis, count)]) + u"\n"
    
    page = wikipedia.Page(wikipedia.getSite(), u'위키백과:사용자 모임/자동목록')
    page.put(result, u"로봇:자동목록 갱신")
    
    print result
    print
    print "# of stubs:", len(stub_list)
    for i, m, j in data:
        print "# of %s"%i, len(j)
    
# Run the bot only when this file is executed as a script. stopme() sits in
# the finally clause, so it is called whether main() returns or raises.
if __name__ == "__main__":
    try:
        main()
    finally:
        wikipedia.stopme()

동음이의 목록

위키백과:동음이의어 문서의 목록, 위키백과:(동음이의) 문서의 목록 자동관리용

# -*- encoding: utf-8  -*-

import wikipedia, pagegenerators, catlib
from collections import defaultdict

def title_key(title):
    """Return the index heading under which ``title`` is filed.

    For a title that starts with a precomposed Hangul syllable the result is
    that syllable's initial consonant, with the doubled consonants folded
    into their plain counterparts (ㄲ→ㄱ, ㄸ→ㄷ, ㅃ→ㅂ, ㅆ→ㅅ, ㅉ→ㅈ).
    Every other title, including an empty one, is filed under u'기타'.

    ``title`` may be a unicode string or a UTF-8 encoded byte string.
    """
    # Decode anything that is not already text. Comparing against the type of
    # a u'' literal keeps the check valid whether or not `str` means bytes.
    text_type = type(u'')
    if not isinstance(title, text_type):
        title = title.decode('utf-8')
    if not title:
        return u'기타'

    # Hangul syllables are laid out from u'가' in 19 runs of 588 code points
    # (21 vowels x 28 finals), one run per initial consonant. Floor division
    # is spelled out so the index is an integer under any division rules.
    cho = (ord(title[0]) - ord(u'가')) // 588
    if 0 <= cho < 19:
        return u'ㄱㄱㄴㄷㄷㄹㅁㅂㅂㅅㅅㅇㅈㅈㅊㅋㅌㅍㅎ'[cho]
    return u'기타'

def combine(data, key, f):
    """Render one index section: a "== key ==" heading plus its entries.

    ``data`` is sorted in place; every item is then passed through ``f`` and
    the formatted entries are joined with " -" followed by a newline.
    """
    data.sort()
    heading = u"== %s ==\n" % key
    entries = [f(item) for item in data]
    return heading + u" -\n".join(entries)

def output(data, f):
    """Concatenate the sections of every key in ``data``, in sorted key order.

    Each section is rendered by ``combine`` and followed by a blank line.
    Keys sort by code point, so the consonant headings precede u'기타'.
    """
    pieces = []
    for heading in sorted(data):
        pieces.append(combine(data[heading], heading, f))
        pieces.append(u"\n\n")
    return u"".join(pieces)

def write_page(title, data, sub=False):
    """Replace the generated body of the wiki page ``title`` with ``data``.

    Everything before the first "==" is kept as the header. Unless ``sub`` is
    true, everything from the first "[[분류:" onwards is kept as the footer;
    subpages are treated as having no footer. A page that lacks the footer
    marker is saved with an empty footer instead of failing with IndexError.
    """
    page = wikipedia.Page(wikipedia.getSite(), title)
    text = page.get()

    head = text.split(u"==", 1)[0]
    tail = u""
    if not sub:
        tailkey = u"[[분류:"
        # partition() returns an empty separator and remainder when the
        # marker is absent, which leaves the footer empty in that case.
        _before, marker, rest = text.partition(tailkey)
        tail = marker + rest

    new_text = head + data + tail
    page.put(new_text, u"로봇:동음이의 목록 자동 갱신")
    
def main():
    u"""Regenerate the disambiguation index pages.

    Maintains [[위키백과:동음이의어 문서의 목록]] (one subpage per heading) and
    [[위키백과:(동음이의) 문서의 목록]].

    On a page being rewritten, the span from the first "==" up to just
    before "[[분류:" is treated as the area this bot edits.
    """
    suffix = u" (동음이의)"
    plain_pages = defaultdict(list)     # titles that lack the suffix
    suffixed_pages = defaultdict(list)  # titles with the suffix, stored without it

    category = catlib.Category(wikipedia.getSite(), u"분류:모든 동음이의어 문서")
    for page in category.articles():
        name = page.title()
        heading = title_key(name)
        if name.endswith(suffix):
            suffixed_pages[heading].append(name[:-len(suffix)])
        else:
            plain_pages[heading].append(name)

    # Plain titles: one subpage per heading.
    for heading in plain_pages.keys():
        subpage = u"위키백과:동음이의어 문서의 목록/%s"%heading
        section = combine(plain_pages[heading], heading, lambda s: u"[[%s]]"%s)
        write_page(subpage, section, sub=True)

    # Suffixed titles: a single page, piped so the suffix is hidden.
    listing = output(suffixed_pages, lambda s: u"[[%s (동음이의)|%s]]"%(s,s))
    write_page(u"위키백과:(동음이의) 문서의 목록", listing)

# Entry point: run the bot, calling stopme() whether main() returns or raises.
# NOTE(review): unlike the other scripts on this page there is no
# `if __name__ == "__main__":` guard here, so main() also runs on import.
try:
    main()
finally:
    wikipedia.stopme()

오늘의 그림

틀:오늘의 그림, 틀:오늘의 그림/설명 갱신용

현재 작동 오류가 나고 있습니다. commons:User:Kwj2772/potdlist를 대신 사용해주세요.

#-*- encoding: utf-8  -*-

import time
import re

import wikipedia, catlib

# Site handles shared by the functions below.
# wp: the site getSite() returns when called without arguments; main() uses it
#     for the local "틀:" pages (presumably the configured home wiki — confirm).
# co: the site requested as ("commons", "commons"); main() reads the
#     "Template:Potd/..." pages from it.
wp = wikipedia.getSite()
co = wikipedia.getSite("commons", "commons")

def get_head_tail(page):
    """Split the text of ``page`` into the parts surrounding its data rows.

    The page is assumed to be laid out like this:

        {{#switch: ...
        |1=first
        |2=second
        ...
        |
        }}
        remainder

    Returns ``(head, tail)``: ``head`` is everything before the first "|1="
    (the whole text when that marker is missing) and ``tail`` is everything
    after the first "|" that is immediately followed by a newline. Raises
    IndexError when no such "|" line ending exists.
    """
    text = page.get()

    first_row = text.find("|1=")
    head = text if first_row < 0 else text[:first_row]

    around_terminator = text.split("|\n", 1)
    tail = around_terminator[1]
    return head, tail

def pack_page(page, items):
    """Rewrite the numbered rows of ``page`` from ``items`` and save it.

    Rows are emitted as "|<n>=<item>" lines numbered from 1 and closed by an
    empty "|" line, placed between the head and tail that get_head_tail()
    extracts from the current page text.
    """
    head, tail = get_head_tail(page)

    rows = [u"|%d=%s\n"%(position + 1, item)
            for position, item in enumerate(items)]
    body = u"".join(rows) + u"|\n"

    page.put(head + body + tail, u"robot: 자동 갱신")

def normalize(text):
    """Adapt the wikilinks of a Commons description for use on ko.

    * [[plain link]] -> [[commons:plain link]]
    * a link whose text before the first ":" is at most two characters long,
      or is "image" in any letter case, gets a leading ":" instead
    A leading ":" already present on the link is dropped before deciding.
    """
    def relink(match):
        target = match.group(1)
        if target.startswith(':'):
            target = target[1:]
        prefix, colon, _rest = target.partition(':')
        keeps_prefix = bool(colon) and (len(prefix) <= 2 or prefix.lower() == 'image')
        lead = ':' if keeps_prefix else 'commons:'
        return "[[" + lead + target + "]]"
    return re.sub(r"\[\[(.+?)\]\]", relink, text)

def main():
    """Refresh the picture-of-the-day templates for the current month.

    For each day from 1 to 31 the image page and its description page are
    read from Commons; the loop stops at the first day whose image page does
    not exist. Descriptions are taken in ``ko`` and, when that page is
    missing, in ``en``. Both local templates are then rewritten. Returns True.
    """
    primary_lang = u'ko'
    fallback_lang = u'en'
    image_title = u"Template:Potd/%d-%.2d-%.2d"
    caption_title = u"Template:Potd/%d-%.2d-%.2d (%s)" # YYYY-MM-DD (ko)

    now = time.localtime() # time.gmtime() to use GMT
    year, month = now[0], now[1]

    img_page = wikipedia.Page(wp, u"틀:오늘의 그림")
    desc_page = wikipedia.Page(wp, u"틀:오늘의 그림/설명")

    def fetch(title):
        # Text of a Commons page; wikipedia.NoPage is left to the caller.
        return wikipedia.Page(co, title).get()

    def visible_part(text):
        # Keep only what precedes the first <noinclude> tag.
        return text.split('<noinclude>', 1)[0]

    images = []
    captions = []
    for day in range(1, 32):
        try:
            image = visible_part(fetch(image_title%(year, month, day)))
        except wikipedia.NoPage:
            break

        try:
            caption = fetch(caption_title%(year, month, day, primary_lang))
        except wikipedia.NoPage:
            caption = fetch(caption_title%(year, month, day, fallback_lang))
        caption = normalize(visible_part(caption))

        # Progress line, forced to ASCII so any console can show it.
        print (u"%d: %s"%(day, image)).encode('ascii', 'replace')

        images.append(image)
        captions.append(caption)

    pack_page(img_page, images)
    pack_page(desc_page, captions)

    return True
    
# Run the bot only when this file is executed as a script. stopme() sits in
# the finally clause, so it is called whether main() returns or raises.
if __name__ == "__main__":
    try:
        main()
    finally:
        wikipedia.stopme()

보존문서

월별로 나눠지는 문서들에 대해, 목차를 모아서 년도별 문서로 저장합니다.

사용법

wpko_archive.py # 올해 사랑방 문서를 보존
wpko_archive.py -y 2008 -a "위키백과:사용자 관리 요청" -f # 2008년 사용자 관리 요청을 보존. 중간에 끊겨도 계속 진행
- -y: 연도
- -a: 보존할 문서명
- -f: 월별 문서가 없는 것 같아도 상관없이 계속 진행
- -t: 테스트용. /xxxx년 문서 대신에 특정 문서(사용자토론:Klutzy/temp3)에 저장하는 흑마법.
- -x: 문서에 저장하지 않고 직접 파일(test.txt)로 저장하는 흑마법. 나중에 제대로 된 옵션으로 정리해야 함

주의사항

보존문서는 "보존할_문서명/xxxx년", 즉 위키백과:사랑방/2009년과 같은 식으로 동작합니다.
목차(TOC)가 없으면 글이 없는 것으로 간주해 멈춥니다. 문단이 몇 개 없는 문서의 경우 제대로 처리가 되지 않을 수 있습니다. 또한 각 월별 문서가 중간부터 존재하는 경우에도 제대로 동작하지 않을 수 있습니다. 이 경우에는 -f를 붙여주세요.
사랑방에 한해서 2009년부터 월별 대신 주별 보존을 시행합니다.

# -*- coding: utf-8  -*-

u"""자동 보존문서 색인 생성"""

import re
import urllib

import wikipedia
# Site handle shared by get_toc() (page fetching) and main() (saving).
site = wikipedia.getSite()

def get_toc(pagename):
    """Return the level-1 table-of-contents entries of ``pagename``.

    The rendered HTML is scraped instead of the wikitext: headings taken
    from "== title ==" wikitext would still need their wiki markup removed.

    Returns a list of ``(target, heading)`` tuples, where ``target`` is
    ``pagename`` with the entry's href appended and ``heading`` is the text
    of its "toctext" span. An empty list is returned when nothing matches or
    when fetching or parsing fails.
    """
    try:
        ret = []
        text = site.getUrl(site.get_address(pagename.encode('utf-8')))
        r = re.compile(r"\<li.+?toclevel\-1.+?href=\"(.*?)\".+?toctext.+?\>(.*?)\<\/span\>")
        for href, title in r.findall(text):
            ret.append((pagename + href, title))
        return ret
    except Exception:
        # Best effort: any failure counts as "no TOC". Catching Exception
        # rather than everything lets KeyboardInterrupt and SystemExit
        # propagate instead of being mistaken for an empty page.
        return []

def usage():
    """Print the two-line command-line help."""
    for line in ("wpko_archive.py", "usage: read code"):
        print(line)

def main():
    import sys, getopt
    import time

    import config
    config.console_encoding

    year = time.localtime()[0]
    testing_mode = False
    archive = u'위키백과:사랑방'
    force_run = False
    
    xxx = False

    argv = [i.decode(config.console_encoding) for i in sys.argv[1:]] # @@#@#$@#$

    try:
        opts, args = getopt.getopt(argv, "ta:y:fx", ["test", "archive-target=", "year="])
    except:
        usage()
        return

    for o, a in opts:
        if o in ("-t", "test"):
            testing_mode = True
        elif o in ("-a", "--archive-target"):
            archive = a
        elif o in ("-y", "--year"):
            year = int(a)
        elif o=='-f':
            force_run = True
        elif o=='-x':
            xxx = True
    
    archive_page = u'%s/%d년'%(archive, year)
    if testing_mode:
        archive_page = u'사용자토론:Klutzy/temp3'

    result = u""

    result += u"""{{%s/보존}}

%d년 %s 색인 목록입니다.
__TOC__
"""%(archive, year, archive.split(':', 1)[1])

    page_template = u"%s/%d년 %d월"
    N = 12
    if archive==u'위키백과:사랑방' and year>2008:
        page_template = u"%s/%d년 제%d주"
        N = 52

    print u"작업 대상:", archive
    print u"범위: %s ~ %s"%(page_template%(archive, year, 1), page_template%(archive, year, N))
    print u"저장할 곳:", archive_page
    print "XXX" if xxx else ""

    section = page_template.split('/', 1)[1]
    for index in range(1, N+1):
        p = page_template%(archive, year, index)
        print "page:", p

        toc = get_toc(p)
        if not toc and not force_run:
            print u"목차를 찾을 수 없습니다. 작업을 중지합니다."
            break
        result += u"== [[%s|%s]] ==\n"%(p, section%(year, index))
        result += u"<div style='font-size: smaller;'>\n"
        for href, title in toc:
            result += "* [[%s|<nowiki>%s</nowiki>]]\n"%(href, title)
        result += u"</div>\n\n"

    if xxx:
        file('test.txt', 'wb').write(result.encode('utf-8'))
        return
    wikipedia.Page(site, archive_page).put(result, u"로봇: 자동 보존 문서")
    
# Run the bot only when this file is executed as a script. stopme() sits in
# the finally clause, so it is called whether main() returns or raises.
if __name__ == "__main__":
    try:
        main()
    finally:
        wikipedia.stopme()