محب علوی
لائبریرین
اس دھاگے میں ہم ایک سادہ سا web crawler بنائیں گے جس کا کوڈ اور ویڈیوز Udacity کے کورس میں تفصیلاً دیکھی جا سکتی ہیں۔
 
Introduction to Computer Science
			
			Introduction to Computer Science
import urllib.request
 
def get_next_target(page):
    """Find the first anchor href in *page*.

    Returns a tuple ``(url, end_quote)`` where ``url`` is the text between
    the first pair of double quotes after ``<a href=`` and ``end_quote`` is
    the index of the closing quote.  Returns ``(None, 0)`` when no
    well-formed link remains.
    """
    start_link = page.find('<a href=')
    if start_link == -1:                     # no more link tags at all
        return None, 0
    start_quote = page.find('"', start_link)
    if start_quote == -1:                    # malformed tag: no opening quote
        return None, 0                       # (original sliced page[0:-1] here)
    end_quote = page.find('"', start_quote + 1)
    if end_quote == -1:                      # malformed tag: unterminated quote
        return None, 0
    url = page[start_quote + 1:end_quote]    # text between the two quotes
    return url, end_quote
 
def get_all_links(page):
    """Return a list of every href target found in *page*, in order.

    Repeatedly calls :func:`get_next_target` and advances past each match.
    """
    links = []
    while True:
        url, endpos = get_next_target(page)
        # Stop only when no further link tag exists.  The original tested
        # `if url:`, which also terminated on an empty href="" and silently
        # dropped every link that followed it.
        if url is None:
            break
        links.append(url)
        page = page[endpos:]   # resume scanning after the closing quote
    return links
 
# Fetch the seed page and print every link target found in it.
# The context manager guarantees the HTTP connection is closed even if
# reading fails (the original never closed the response object).
with urllib.request.urlopen('http://www.urduweb.org') as response:
    # Decode the raw bytes to text.  The original used str(response.read()),
    # which produces the repr "b'...'" -- b-prefix, quotes, and escape
    # sequences included -- and corrupts the href extraction that follows.
    page = response.read().decode('utf-8', errors='replace')

# Collect every link found in the page markup.
pagelist = get_all_links(page)

print(pagelist)
	start with tocrawl = [seed]
crawled = []
while there are more pages tocrawl:
    pick a page from tocrawl
    add that page to crawled
    add all the link targets on this page to tocrawl
return crawled
	def crawl2(seed):
  ct=0
  crawled=[]
  tocr=[]
  tocr.append(seed)
  while tocr:
      ct=ct+1
      if ct==50:      #goes to infinity otherwise
          break
      crawled.append(tocr[0])
      print (tocr[0])
      import urllib.request
      try:
          response = urllib.request.urlopen(tocr[0])
          page = str(response.read())
          pagelist = get_all_links(page)
          tocr.extend(pagelist)
      except:
          print("an error here")
      del tocr[0]
  print(crawled)
  return crawled
	ری یوزیبلٹی کا فائدہ اٹھاتے ہوئے:
PHP:def crawl2(seed): ct=0 crawled=[] tocr=[] tocr.append(seed) while tocr: ct=ct+1 if ct==50: #goes to infinity otherwise break crawled.append(tocr[0]) print (tocr[0]) import urllib.request try: response = urllib.request.urlopen(tocr[0]) page = str(response.read()) pagelist = get_all_links(page) tocr.extend(pagelist) except: print("an error here") del tocr[0] print(crawled) return crawled
crawl2('http://www.udacity.com/cs101x/index.html')
	http://www.udacity.com/cs101x/index.html
an error here
['http://www.udacity.com/cs101x/index.html']
	ری یوزیبلٹی کا فائدہ اٹھاتے ہوئے:
PHP:def crawl2(seed): ct=0 crawled=[] tocr=[] tocr.append(seed) while tocr: ct=ct+1 if ct==50: #goes to infinity otherwise break crawled.append(tocr[0]) print (tocr[0]) import urllib.request try: response = urllib.request.urlopen(tocr[0]) page = str(response.read()) pagelist = get_all_links(page) tocr.extend(pagelist) except: print("an error here") del tocr[0] print(crawled) return crawled
ct=ct+1
      if ct==50:      #goes to infinity otherwise
          break
	  import urllib.request
      try:
          response = urllib.request.urlopen(tocr[0])
          page = str(response.read())
          pagelist = get_all_links(page)
          tocr.extend(pagelist)
      except:
          print("an error here")
	def get_page(url):
    try:
        import urllib
        return urllib.urlopen(url).read()
    except:
        return ""
	get_all_links(get_page(page))
	ویسے میرے پاس کام کر رہا ہے ابھی بھی: کوڈ کام نہیں کر رہا نمرہ
میں نے سیڈ کے طور پر مندرجہ بالا لنک دیا
کوڈ:crawl2('http://www.udacity.com/cs101x/index.html')
جواب میں یہ ایرر آ رہی ہے۔
کوڈ:http://www.udacity.com/cs101x/index.html an error here ['http://www.udacity.com/cs101x/index.html']
crawl2('http://www.udacity.com/cs101x/index.html')
	http://www.udacity.com/cs101x/index.html
http://www.udacity.com/cs101x/crawling.html
http://www.udacity.com/cs101x/walking.html
http://www.udacity.com/cs101x/flying.html
http://www.udacity.com/cs101x/kicking.html
http://www.udacity.com/cs101x/index.html
http://www.udacity.com/cs101x/crawling.html
http://www.udacity.com/cs101x/walking.html
http://www.udacity.com/cs101x/flying.html
http://www.udacity.com/cs101x/kicking.html
	یہ ضرور ہے کہ یہ کوڈ رکتا نہیں ہے، چلتا رہتا ہے۔
ایک سوال ہے ،اگر اسے وائل لوپ کے بجائے recursion سے کیا جائے تو کیا اس کی رفتار کم ہو جائے گی؟