#!/usr/bin/env python
"""Detect source code links in the PDFs of conference papers.

Each paper PDF in the ``nips2008_pdf.tgz`` archive is extracted into the
current directory and converted to text with the external ``pdftotext``
tool. Every text line containing a URL is printed, and the list of papers
with at least one such line is printed at the end. The temporary PDF and
text files are removed after each paper is scanned.
"""

import os
import subprocess
import sys
import tarfile

# Archive of conference papers, looked up in the current directory.
ARCHIVE_NAME = 'nips2008_pdf.tgz'

# URL prefixes that mark a line as containing a link.
LINK_SCHEMES = ('http://', 'https://', 'ftp://')

# Hosts that show up in PDF generation metadata rather than in the paper
# text; lines mentioning any of them are not reported.
IGNORED_HOSTS = (
    'ns.adobe.com',
    'w3.org',
    'purl.org',
    'cairographics.org',
    'apple.com',
    'iec.ch',
)


def is_paper_pdf(name):
    """Return True if archive member *name* is a paper PDF, not a slide deck."""
    return name.endswith('.pdf') and not name.endswith('_slide.pdf')


def has_link(text):
    """Return True if *text* contains a link that is not PDF metadata.

    Substring membership is used instead of ``str.find(...) > 0`` so that a
    link starting at the very first column of a line is detected as well.
    """
    if not any(scheme in text for scheme in LINK_SCHEMES):
        return False
    return not any(host in text for host in IGNORED_HOSTS)


def find_link_lines(lines):
    """Return the entries of *lines* that contain a reportable link."""
    return [text for text in lines if has_link(text)]


def scan_member(archive, filename):
    """Extract *filename* from *archive* and return its lines with links.

    The PDF is extracted relative to the current directory and converted
    with ``pdftotext``; the text output is expected next to the PDF with a
    ``.txt`` extension. Both temporary files are deleted before returning,
    even when the conversion fails. A failed conversion is reported on
    stderr and yields an empty list.
    """
    text_name = filename[:-4] + '.txt'
    archive.extract(filename, '.')
    try:
        # An argument list (no shell) keeps member names with spaces or
        # shell metacharacters from being misinterpreted.
        status = subprocess.call(['pdftotext', filename])
        if status != 0 or not os.path.exists(text_name):
            sys.stderr.write('pdftotext failed for %s\n' % filename)
            return []
        with open(text_name, 'r') as text_file:
            return find_link_lines(text_file)
    finally:
        for path in (filename, text_name):
            if os.path.exists(path):
                os.remove(path)


def main():
    """Scan every paper in the archive and print the links that were found.

    Returns the list of archive member names that contain at least one link.
    """
    files_with_link = []
    input_file = tarfile.open(ARCHIVE_NAME, 'r')
    try:
        for filename in input_file.getnames():
            if not is_paper_pdf(filename):
                continue
            link_lines = scan_member(input_file, filename)
            for text in link_lines:
                # A single pre-formatted argument prints identically under
                # Python 2 and Python 3.
                print('%s %s' % (filename, text))
            if link_lines:
                files_with_link.append(filename)
    finally:
        input_file.close()
    print(files_with_link)
    return files_with_link


if __name__ == '__main__':
    main()