"""Create a .gdl graph file from IMDb's movie-links.list.

The script reads the ISO-8859-1 encoded list file, collects every
"referenced in" relation between a quoted title and the work referencing it,
and prints a GDL graph description to standard output.
"""

import codecs
import collections
import errno
import io
import optparse
import os
import re
import sys

# NOTE(review): hard-coded node id that is never linked into the graph.
# Presumably an unwanted hub in one particular data dump -- confirm whether
# it is still needed, since ids depend on the order of the input file.
_EXCLUDED_NODE_ID = 6025

# A node is only drawn when it takes part in more than this many edges.
_MIN_EDGE_COUNT = 2

# Title lines containing one of these markers are skipped together with the
# reference lines that follow them (TV episodes and TV movies).
_TITLE_SKIP_MARKERS = ("{", "(TV)")

# Reference lines containing one of these markers are ignored.
_REFERENCE_SKIP_MARKERS = ("{", "(TV)", "(V)", "(VG)", "lgyi-show")

_REFERENCE_TAG = "referenced in"

_GDL_HEADER = '''
graph: {
    title            : "IMDB references"
    layoutalgorithm  : tree
    scaling          : 0.5
    colorentry 42    : 152 222 255
    node.shape       : ellipse
    node.color       : 42
    node.height      : 32
    edge.color       : blue
    edge.arrowsize   : 6
    node.textcolor   : black
    splines          : yes
'''


def _contains_any(text, markers):
    """Return True when *text* contains at least one of *markers*."""
    return any(marker in text for marker in markers)


def _emit(text=""):
    """Print one line of text; encode it as UTF-8 first under Python 2."""
    if sys.version_info[0] < 3 and not isinstance(text, str):
        text = text.encode("utf-8", "replace")
    print(text)


def _parse_links(lines):
    """Collect node names and reference edges from the list file's lines.

    Returns a ``(names, edges)`` pair. ``names`` holds every title seen in a
    usable reference relation; a title's position in the list is its node
    id. ``edges`` is a list of ``(title_id, referrer_id)`` tuples in input
    order, without self-references, without the excluded node and without
    duplicates in either direction.
    """
    names = []
    name_ids = {}
    edges = []
    seen_edges = set()
    title = None

    def node_id(name):
        # Dictionary lookup keeps id assignment O(1) per name.
        if name not in name_ids:
            name_ids[name] = len(names)
            names.append(name)
        return name_ids[name]

    for line in lines:
        # NOTE(review): only lines starting with a double quote are treated
        # as titles. Reference lines that follow any other kind of title
        # line stay attributed to the most recent quoted title -- confirm
        # that this is intended.
        if line.startswith('"'):
            if _contains_any(line, _TITLE_SKIP_MARKERS):
                # Forget the current title so the skipped entry's references
                # are not attributed to the previous title.
                title = None
            else:
                title = line.replace('"', "").strip()
            continue

        if _REFERENCE_TAG not in line or title is None:
            continue
        if _contains_any(line, _REFERENCE_SKIP_MARKERS):
            continue

        # Turn '  (referenced in "Name" (2001))' into 'Name (2001)'.
        refer = line.replace(_REFERENCE_TAG, "")
        refer = refer.replace("(", "", 1)
        refer = refer.replace("))", ")")
        refer = refer.replace('"', "").strip()

        # Both names get an id even when the edge itself is dropped below.
        id_title = node_id(title)
        id_refer = node_id(refer)

        if id_title == id_refer:
            continue
        if _EXCLUDED_NODE_ID in (id_title, id_refer):
            continue
        if (id_title, id_refer) in seen_edges:
            continue
        if (id_refer, id_title) in seen_edges:
            continue

        seen_edges.add((id_title, id_refer))
        edges.append((id_title, id_refer))

    return names, edges


def _print_graph(names, edges):
    """Print the GDL graph for *names* and *edges* to standard output.

    Only nodes taking part in more than ``_MIN_EDGE_COUNT`` edges are
    printed, and only edges whose two end points are both printed.
    """
    edge_counts = collections.Counter(
        node for edge in edges for node in edge
    )
    used = set(
        node for node, count in edge_counts.items()
        if count > _MIN_EDGE_COUNT
    )

    _emit(_GDL_HEADER)

    for index, name in enumerate(names):
        if index in used:
            _emit('\tnode: {title: "%d" label: "%s" }' % (index, name))

    _emit("")

    for source, target in edges:
        if source in used and target in used:
            _emit(
                '\tedge: { sourcename: "%d" targetname: "%d" }'
                % (source, target)
            )

    _emit("}")


def imdbref(argv):
    """Create a .gdl graph from IMDb's movie-links.list and print it.

    *argv* is the full argument vector including the program name, as in
    ``sys.argv``. The ``-f``/``--file`` option names the list file to read
    (default ``movie-links.list``); it is decoded as ISO-8859-1.

    Exits through ``optparse`` with an error message when the file cannot
    be opened.
    """
    parser = optparse.OptionParser(
        usage='Usage: %prog [options]',
        description="create a .gdl-file from imdb's movie-links.list",
        version="%prog 0.1",
    )
    parser.add_option(
        "-f", "--file",
        default="movie-links.list",
        help="file to process [default: %default]",
    )
    (options, args) = parser.parse_args(argv[1:])

    if options.file is None:
        parser.print_help()
        sys.exit(-1)

    # Decode the file directly instead of shelling out to iconv: no shell
    # command is built from user input and no temporary file is needed.
    try:
        source = io.open(options.file, encoding="iso-8859-1")
    except IOError as exc:
        parser.error("cannot read %s: %s" % (options.file, exc))

    with source:
        names, edges = _parse_links(source)

    _print_graph(names, edges)


def sort2(seq):
    """Return the items of *seq* without duplicates, in first-seen order."""
    seen = set()
    unique = []
    for item in seq:
        if item not in seen:
            seen.add(item)
            unique.append(item)
    return unique


if __name__ == '__main__':
    imdbref(sys.argv)