"""Link Extractor.

Link Extractor, as the name suggests, scrapes all the URLs from a particular
web page. It extracts relative as well as absolute URLs from an HTML page and
outputs them in a more readable and useful format.

Note: the code uses Django's URL-checking regex for validating URLs.

Usage:
    >> python link_extractor.py URL

Example:
    python link_extractor.py http://google.com

Output format:
    [URL1, URL2, URL3 ...]

Sample usage:
    >> python link_extractor.py http://google.com
    [u'http://www.google.co.in/webhp?hl=en&tab=ww',
     u'http://www.google.co.in/imghp?hl=en&tab=wi',
     u'http://maps.google.co.in/maps?hl=en&tab=wl',
     u'http://news.google.co.in/nwshp?hl=en&tab=wn',
     u'http://www.orkut.com/Main?tab=w0#Home',
     u'http://translate.google.co.in/?hl=en&tab=wT',
     u'https://mail.google.com/mail/?tab=wm',
     u'http://www.google.co.in/intl/en/options/',
     u'http://books.google.co.in/bkshp?hl=en&tab=wp',
     u'http://scholar.google.co.in/schhp?hl=en&tab=ws',
     u'http://www.google.co.in/blogsearch?hl=en&tab=wb',
     u'http://www.youtube.com/?tab=w1&gl=IN',
     u'https://www.google.com/calendar?tab=wc',
     u'http://picasaweb.google.co.in/home?hl=en&tab=wq',
     u'https://docs.google.com/?tab=wo',
     u'https://sites.google.com/?tab=w3',
     u'http://groups.google.co.in/grphp?hl=en&tab=wg',
     u'http://www.google.co.in/reader/?hl=en&tab=wy',
     u'http://google.com/url?sa=p&pref=ig&pval=3&q=http://www.google.co.in/ig%3Fhl%3Den%26source%3Diglk&usg=AFQjCNHe9Dv_h1zYL1VKlS7wKo_0Kg38dg',
     u'http://www.google.co.in/history/optout?hl=en',
     u'https://accounts.google.com/ServiceLogin?hl=en&continue=http://www.google.co.in/',
     u'http://google.com/preferences?hl=en',
     u'http://google.com/chrome/index.html?hl=en&brand=CHNG&utm_source=en-hpp&utm_medium=hpp&utm_campaign=en',
     u'http://google.com/advanced_search?hl=en',
     u'http://google.com/language_tools?hl=en',
     u'http://www.google.co.in/setprefs?sig=0_WOxfwaHAuUx0n-9xUqHSVCu73ZY=&hl=hi',
     u'http://www.google.co.in/setprefs?sig=0_WOxfwaHAuUx0n-9xUqHSVCu73ZY=&hl=bn',
     u'http://www.google.co.in/setprefs?sig=0_WOxfwaHAuUx0n-9xUqHSVCu73ZY=&hl=te',
     u'http://www.google.co.in/setprefs?sig=0_WOxfwaHAuUx0n-9xUqHSVCu73ZY=&hl=mr',
     u'http://www.google.co.in/setprefs?sig=0_WOxfwaHAuUx0n-9xUqHSVCu73ZY=&hl=ta',
     u'http://www.google.co.in/setprefs?sig=0_WOxfwaHAuUx0n-9xUqHSVCu73ZY=&hl=gu',
     u'http://www.google.co.in/setprefs?sig=0_WOxfwaHAuUx0n-9xUqHSVCu73ZY=&hl=kn',
     u'http://www.google.co.in/setprefs?sig=0_WOxfwaHAuUx0n-9xUqHSVCu73ZY=&hl=ml',
     u'http://www.google.co.in/setprefs?sig=0_WOxfwaHAuUx0n-9xUqHSVCu73ZY=&hl=pa',
     u'http://google.com/intl/en/ads/',
     u'http://www.google.co.in/services/',
     u'http://google.com/intl/en/about.html',
     u'http://www.google.com/ncr',
     u'http://google.com/intl/en/privacy.html']
"""

import re
import sys
from contextlib import closing

try:  # Python 2 module names, as used by the original script
    import urllib2
    from urlparse import urljoin  # to support relative urls
except ImportError:  # Python 3 moved both into the urllib package
    import urllib.request as urllib2
    from urllib.parse import urljoin

# Django's regex for URL validation, compiled once instead of on every call.
URL_REGEX = re.compile(
    r'^(?:http|ftp)s?://'  # http://, https://, ftp:// or ftps://
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
    r'localhost|'  # localhost...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
    r'(?::\d+)?'  # optional port
    r'(?:/?|[/?]\S+)$',
    re.IGNORECASE)

# An href starting with one of these is already absolute.
URL_PREFIXES = ('http://', 'ftp://', 'https://')

# Browser-like User-agent header sent with the page request.
USER_AGENT = ('Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) '
              'Gecko/20071127 Firefox/2.0.0.11')

# Find at most this many anchor tags on a web page.
MAX_ANCHORS = 100


def checkUrl(url):
    """Return 1 if *url* matches the Django URL-validation regex, else 0."""
    return 1 if URL_REGEX.search(url) is not None else 0


def extract_links(hrefs, base_url):
    """Resolve raw href values into unique, valid absolute URLs.

    Args:
        hrefs: iterable of href attribute strings taken from anchor tags.
        base_url: URL of the page the hrefs came from; relative hrefs are
            joined against it.

    Returns:
        A list of absolute URLs in first-seen order, without duplicates.
    """
    seen = set()
    links = []
    for href in hrefs:
        if href.startswith('#'):  # mostly points to the same page
            continue
        if not href.startswith(URL_PREFIXES):
            # Relative link. This also lets through a ton of non-links
            # (href='javascript:', 'mailto:' ...), which urljoin returns
            # unchanged and the validation below rejects.
            href = urljoin(base_url, href)
        # Validate the extracted link itself. The original validated the page
        # URL here, so invalid links were never filtered out.
        if checkUrl(href) == 0 or href in seen:
            continue
        seen.add(href)
        links.append(href)
    return links


def fetch_page(url):
    """Download *url* and return the raw response body.

    Raises:
        urllib2.URLError: the request failed.
        ValueError: *url* is not something urllib can open.
    """
    request = urllib2.Request(url)
    request.add_header('User-agent', USER_AGENT)
    # closing() guarantees the connection is released even if read() fails.
    with closing(urllib2.urlopen(request)) as response:
        return response.read()


def main(argv=None):
    """Print the list of links found on the page named on the command line.

    Args:
        argv: argument vector (program name first); defaults to ``sys.argv``.

    Returns:
        Process exit status: 0 on success, 1 on a usage or fetch error.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) != 2:
        print("Usage :-")
        print(">> python %s URL" % argv[0])
        return 1

    current_url = argv[1]
    try:
        source = fetch_page(current_url)
    except urllib2.URLError as reason:
        print("URLError : %s" % (reason,))
        return 1
    except ValueError:
        print("Invalid URL : %s" % current_url)
        return 1
    except KeyboardInterrupt:
        print("Ctrl + C Detected, Exiting")
        return 1

    # Third-party parser, imported only when a page is actually parsed so the
    # URL helpers above stay importable without it.
    from BeautifulSoup import BeautifulSoup

    soup = BeautifulSoup(source)
    hrefs = []
    for tag in soup('a', limit=MAX_ANCHORS):
        try:
            hrefs.append(tag['href'])
        except KeyError:  # anchor without an href attribute
            continue

    print(extract_links(hrefs, current_url))
    return 0


if __name__ == '__main__':
    sys.exit(main())
Hi dude, can I use it to extract YouTube download links and print them again and again whenever someone refreshes the webpage?