# -*- coding: utf-8 -*-
# Python 2.7
# May require up to 32 GB of RAM to process the 2015 English Wikipedia dump.
from lxml import etree
from sets import Set
import sys, time, re, os, networkx, copy


def wikipedia_main_namespace(title):
    # Return True only for titles in the main article namespace; any title
    # carrying one of these prefixes belongs to a non-article namespace
    # (project pages, talk pages, users, categories, files, ...).
    non_article_prefixes = (
        "Wikipedia:", "Wikipedia talk:",
        "User:", "User talk:",
        "Category:", "Category talk:",
        "Talk:",
        "File:", "File talk:",
        "Template:", "Template talk:",
        "Portal:",
        "Special:",
        ":",
    )
    return not title.startswith(non_article_prefixes)


def capitalize(line):
    # Mimic MediaWiki title normalization: upper-case the first character
    # and leave the rest of the title untouched.
    if len(line) == 0:
        return line
    if len(line) == 1:
        return line.upper()
    return line[0].upper() + line[1:]


def process_redirects(fname):
    # Build a dict mapping redirect source titles to their targets,
    # keeping only redirects between main-namespace articles.
    redirects = {}
    with open(fname, 'r') as f:
        for line in f:
            m = re.search(r'REDIRECT \| ([^\|]*) \| ([^\|\n]*)', line)
            if m is not None:
                rfrom = m.group(1)
                rto = m.group(2)
                if wikipedia_main_namespace(rfrom) and wikipedia_main_namespace(rto):
                    redirects[capitalize(rfrom)] = capitalize(rto)
            else:
                # print "line unparsed", line
                pass
    return redirects


def process_links(fname, redirects):
    # Build a dict mapping each article title to the set of titles it links
    # to, resolving redirects and dropping anything outside the main namespace.
    links = {}
    lines_read = 0
    with open(fname, 'r') as f:
        for line in f:
            lines_read = lines_read + 1
            if (lines_read % 100000) == 0:
                print "process_links(), lines_read=", lines_read
            m = re.search(r'LINK \| ([^\|]*) \| ([^\|\n]*)', line)
            if m is not None:
                lfrom = m.group(1)
                lto = m.group(2)
                if lfrom.startswith('{{'):
                    # skip links originating from templates
                    continue
                lfrom_cap = capitalize(lfrom)
                if wikipedia_main_namespace(lfrom):
                    if lfrom_cap not in links:
                        links[lfrom_cap] = Set()
                    # resolve redirects before storing the link target
                    if lto in redirects:
                        lto = redirects[lto]
                    if wikipedia_main_namespace(lto):
                        links[lfrom_cap].add(capitalize(lto))
            else:
                # print "line unparsed", line
                pass
    return links


# command line arguments: <redirects file> <links file> <output pickle>
def mainmain():
    redirects = process_redirects(sys.argv[1])
    print "redirects", len(redirects)
    sys.stderr.write(sys.argv[1] + " processed\n")

    links = process_links(sys.argv[2], redirects)
    links_t = len(links)
    print "links", links_t
    sys.stderr.write(sys.argv[2] + " processed\n")

    # Build an undirected graph of mutual links: an edge is added when the
    # current article has at least 6 outgoing links and the linked article
    # links back to it.
    G = networkx.Graph()
    articles_processed = 0
    for article in links:
        articles_processed = articles_processed + 1
        if (articles_processed % 100000) == 0:
            sys.stdout.write('links processed=' + str(articles_processed) + '/' +
                             str(links_t) + ' (' + str(articles_processed * 100 / links_t) + '%)\n')
        if len(links[article]) < 6:
            continue
        G.add_node(article)
        for l in links[article]:
            if (l in links) and (article in links[l]):
                # the back link is also present, so keep the edge
                G.add_node(l)
                G.add_edge(article, l)
    networkx.write_gpickle(G, sys.argv[3])


if __name__ == '__main__':
    mainmain()
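
# ---------------------------------------------------------------------------
# Usage sketch (not part of the script above; the script and file names below
# are only assumed examples of whatever is passed as sys.argv[1..3]):
#
#   python build_mutual_link_graph.py redirects.txt links.txt wiki_graph.gpickle
#
# The resulting pickle can later be reloaded with the matching networkx call
# (available in the networkx releases that still support Python 2):
#
#   G = networkx.read_gpickle("wiki_graph.gpickle")
#   print G.number_of_nodes(), G.number_of_edges()
# ---------------------------------------------------------------------------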