Ñò
ÝT~Kc           @   sf  d  Z  d d k Z d d k Z d d k Z d d k Z d d k Z d d k l Z d d k Z d d k	 Z	 d d k
 Z
 d d k Z d d k Z d d k Z d d k Z d d k l Z y d d k l Z Wn d d k l Z n Xe i d ƒ Z e i d ƒ Z e i d	 ƒ Z e i d
 ƒ Z e a d „  Z d d „ Z d „  Z d „  Z d „  Z e  d „ Z! d S(   s†   
Fetch either a single feed, or a set of feeds, normalize to Atom and XHTML,
and write each as a set of entries in a cache directory.
iÿÿÿÿN(   t   minidom(   t   StringIO(   t   md5(   t   news   ^\w+:/*(\w+:|www\.)?s   [?/:|]+s   ^[,.]*s   [,.]*$c         C   s}  yS t  i | ƒ o? t | t ƒ o | i d ƒ i d ƒ } qR | i d ƒ } n Wn n Xt | t ƒ o | i d ƒ } n t  i d | ƒ } t i d | ƒ } t	 i d | ƒ } t
 i d | ƒ } t | ƒ d j o | i d ƒ } x} t t | ƒ d d ƒ D]_ } t d i | |  ƒ ƒ d j  o9 d i | |  ƒ d t d i | | ƒ ƒ i ƒ  } PqqWn t i i |  | ƒ S(	   s•   Return a filename suitable for the cache.

    Strips dangerous and common characters to create a filename we
    can use to store the cache in.
    s   utf-8t   idnat    t   ,iú   i    iÿÿÿÿiÜ   (   t   re_url_schemet   matcht
   isinstancet   strt   decodet   encodet   unicodet   subt   re_slasht   re_initial_cruftt   re_final_cruftt   lent   splitt   ranget   joinR   t	   hexdigestt   ost   path(   t	   directoryt   filenamet   partst   i(    (    s   /net/bzr/venus/planet/spider.pyR      s,       c         C   sK   t  | d ƒ } | i |  ƒ | i ƒ  | o t i | | | f ƒ n d S(   s     write the document out to disk t   wN(   t   opent   writet   closeR   t   utime(   t   xdoct   outt   mtimet   file(    (    s   /net/bzr/venus/planet/spider.pyR   :   s
    
 c         C   s   t  i  |  ƒ } | d d j S(   Ni    t   httpt   https(   s   https   https(   t   urlparse(   t   urit   parsed(    (    s   /net/bzr/venus/planet/spider.pyt   _is_http_uriA   s    c         C   s>  t  i } t i ƒ  } | i d ƒ ps | i d ƒ o# t | i ƒ d j o d | _ q˜ | i o) | i	 i
 i i ƒ  d j o d | _ q˜ d | _ n t i t i ƒ  d t i |  ƒ ƒ } | i d j o¥ | i d	 ƒ o• | i | i d
 <| i d ƒ o7 t | i ƒ d j o! | i d |  ƒ d | i d <qo|  | i j o | i d |  ƒ qo| i d |  | i ƒ ný| i d j oP | i d ƒ o@ t | i ƒ d j o* | i d |  | i ƒ | i | i d
 <n| i d j o
| i d	 ƒ oú | i | i d
 <|  | i j o | i d |  ƒ n | i d |  | i ƒ | i i d ƒ pB | i i d ƒ o+ | i i } t i | ƒ | j o d  Sqéqo| i i i d ƒ o d  S| i i i d ƒ o% | i i i d ƒ o | i d =qonƒ | i d j o | i d |  ƒ n_ | i d j o | i d |  ƒ n; | i d j o | i d | i |  ƒ n | i d |  ƒ | i oM | i oC | i | _ | i i d d ƒ d j | _ | i i d ƒ | _ n t | i ƒ | i d  <| i d! ƒ oø | i d" ƒ o | i o | i | i d# <n: | i i d" ƒ o& | i d" o | i d" | i d# <n | i i d$ ƒ o | i d$ | i d% <n8 | i d& ƒ o' | i o t i  | i ƒ | i d% <n | i i d' ƒ o | i d' | i d( <qån | i oÏ | i i d) ƒ p t! ƒ  | i d) <n d* } | i i d+ ƒ o
 d, } n | i dM j o
 d/ } n xh | i i" D]% } | i# d0 j o | | d1 <Pq`q`W| i i" i$ t i% h d0 d2 6| d1 6|  d3 6ƒ ƒ n x4 t i& |  ƒ i' ƒ  D] \ }	 }
 |
 | i d4 |	 <qÔWt( i( |  | ƒ d5 d6 k  l) } t* d  j o | i, ƒ  a* n h  } xÐ | i D]Å } | i d7 ƒ p | i- o, t. i- d  | ƒ | d7 <| d7 p qBqn d8 } | i d9 ƒ o | i/ } n | i d: ƒ o | i0 } n | | i | i- dN ƒ d j o | | f | | i- <qBqBWt i1 ƒ  } xY| i2 ƒ  D]K\ } } t3 | | i- ƒ } d  } | i d; ƒ p | d; o | i d< d  ƒ | d; <n | i d; ƒ o$ y t4 i5 | i6 ƒ } Wq³q³Xn | p^ y t7 i8 | ƒ i9 } Wq| i i d; ƒ o' y t4 i5 | i i6 ƒ } WqqXqqXn | p t i ƒ  } n t i | ƒ | d; <t. i. | | ƒ } | i: ƒ  i; d= ƒ } | i< ƒ  x; t i= |  ƒ D]* } t> i? | | d> d? ƒ} | p PqƒqƒW| p+ t7 i@ iA | ƒ o t7 iB | ƒ q$q$n tC | | | ƒ t* d  j oo | i i d7 | i i d@ d  ƒ ƒ } | o@ tD | ƒ tE j o | i; d= ƒ } n | t* t3 d8 | i- ƒ <qo	q$q$Wt* o t* iF ƒ  n t i |  ƒ oê g  } | i D]$ } | i d; ƒ o | | i6 q¦	q¦	~ } | iG ƒ  | o! t iH dA | d5 ƒ | i d <n0 | i i d ƒ o t i | i i ƒ g } n | p | d5 | j  o1 dB t i |  ƒ } | i | ƒ | | i d <q‚
n | i dC j oP | i i d ƒ o | i d =n | i i d ƒ o | i d | i d <q¯nÎ | i dD j o dE | i d <n­ | i dF j o dG | i d <nŒ | i d j o dH | i d <nk | i d j o dI | i d <nJ | i d j o dJ | i d <n) | i d j o dK | i | i d <n t7 i@ iA | ƒ p t7 iI | ƒ n tJ iK dL t  iL ƒ } t. iM | iN | i | i | i ƒ tC | i: ƒ  i; d= ƒ t3 | |  ƒ ƒ | i< ƒ  d  S(O   Nt   statust   entriesi    iÈ   t   timeouti˜  iô  i€Q t   urlt   planet_http_locations
   No data %ss   no datat   planet_messages   Updating feed %ss   Updating feed %s @ %si-  s    Feed has moved from <%s> to <%s>i0  s   Feed %s unchangeds   Feed %s unchanged @ %st   planet_updateds   no activity int	   duplicateiš  s   Feed %s gones   Feed %s timed outi  s   Error %d while updating feed %st   planet_bozot   truet   planet_formatt   planet_http_statust   headerst   etagt   planet_http_etags   last-modifiedt   planet_http_last_modifiedt   modifieds   -content-hasht   planet_content_hasht   linkss   application/atom+xmlt   rsss   application/rss+xmlt   rss090t   rss10s   application/rdf+xmlt   selft   typet   relt   hreft   planet_iÿÿÿÿ(   t   idindext   idR   t	   publishedt   updatedt   updated_parsedt   published_parseds   utf-8t   modet   filtert   links   %Y-%m-%dT%H:%M:%SZs   no activity in %d daysiâ   i“  s   403: forbiddeni”  s   404: not founds   408: request timeouts	   410: gones   internal server errors   http status %ssD   <feed xmlns:planet="%s"
      xmlns="http://www.w3.org/2005/Atom"/>
(   s   rss090s   rss10(   R   (O   t   planett   loggert   configt   cache_sources_directoryt   has_keyR   R-   R,   t   bozot   bozo_exceptiont	   __class__t   __name__t   lowert   timet   gmtimet   activity_thresholdR/   t   feedt   warningt   infoR2   t
   feedparsert   _parse_date_iso8601R1   t
   startswitht   errort   versiont   getR
   R9   R8   R<   t   asctimet   listR>   RD   t   appendt   FeedParserDictt   feed_optionst   itemst   scrubRG   t   indext   NoneR   RH   t   reconstituteRI   RJ   t   cache_directoryt   valuesR   t   calendart   timegmRK   R   t   statt   st_mtimet   toxmlR   t   unlinkt   filterst   shellt   runR   t   existst   removeR   RC   R   R    t   sortt   strftimet   makedirsR    t   parseStringt   xmlnst   sourcet   documentElement(   t   feed_urit	   feed_infot   datat   logt   sourcest   activity_horizonRJ   t   feedtypeRO   t   namet   valueRG   t   idst   entryt   cachet
   cache_fileR$   R"   t   outputRN   t   feedidt   _[1]t   msg(    (    s   /net/bzr/venus/planet/spider.pyt
   writeCacheE   s`   	&
& &6 !
  
 
 
	#  
       
  	 $ " '
!  "%c         C   s  d d  k  } d d k l } | i t i ƒ  ƒ } | i d t ƒ \ } } x±| o©| i d | |  ƒ t	 d ƒ }	 t
 |	 d | ƒ t
 |	 d t i h d d	 6ƒ ƒ yÎyc t | t ƒ o | i d
 ƒ }
 n | i d ƒ i d
 ƒ }
 |
 | j o | i d | |
 ƒ n Wn | i d | ƒ | }
 n Xh  } | i i d ƒ o | i d | d <n | i i d ƒ o | i d | d <n | i |
 d d | ƒ\ } } t | p d ƒ i ƒ  | d <| i d j oS | i o d | _ q| i i d ƒ o% | i d | d j o d | _ qn t	 | ƒ }	 t
 |	 d | i d | ƒ ƒ | i d ƒ o | d =n t
 |	 d | ƒ WnN| j
 o | i d | |  ƒ n*| i j
 o" } | i d t | ƒ |  ƒ nú t i j
 o\ } | i i i ƒ  d j o! d |	 i d	 <| i  d |  ƒ qÁ| i d t | ƒ |  ƒ n t! j
 oƒ } d d  k" } d d  k# } | i$ ƒ  \ } } } | i d | ƒ x? | i% | | ƒ | i& | ƒ D] } | i | i' ƒ  ƒ qŸWn X| i( d t d  | | |	 f ƒ | i d t ƒ \ } } qL Wd  S(!   Niÿÿÿÿ(   t   BadStatusLinet   blocks   Fetching %s via %dR   R/   R8   t   500R,   R   s   utf-8s   IRI %s mapped to %ss   unable to map %s to a URIR:   s   If-None-MatchR;   s   If-Modified-Sincet   GETs   -content-hashiÈ   i0  R=   s   content-locations   content-encodings&   Bad Status Line received for %s via %ds   HttpLib2Error: %s via %dR.   t   408s   Timeout in thread-%ds   HTTP Error: %s in thread-%ds   Error processing %st   item()   t   httplib2t   httplibR–   t   HttpRR   t   http_cache_directoryRe   t   TrueR_   R   t   setattrR`   Ri   R	   R   R   R   R]   RT   t   requestR   R   R,   t	   fromcacheRc   t   HttpLib2ErrorR
   t   socketRW   RX   RY   R8   t   warnt	   Exceptiont   syst	   tracebackt   exc_infot   format_exception_onlyt	   format_tbt   rstript   put(   t   thread_indext   input_queuet   output_queueR‡   Rœ   R–   t   hR)   R…   R]   R   R8   t   respt   contentt   eR¨   R©   RC   RŒ   t   tbt   line(    (    s   /net/bzr/venus/planet/spider.pyt
   httpThread  sx     	 


	 c         C   s  t  i } t a t i ƒ  } y' t i t | ƒ ƒ | i	 d | ƒ WnT y3 d d k
 } | i t | ƒ ƒ | i	 d | ƒ Wq™ | i d | ƒ q™ Xn Xd d k l } d d k l } | ƒ  } | ƒ  } h  } t i ƒ  }	 |	 o% t i i |	 ƒ o t i |	 ƒ n t t i ƒ  ƒ oZ xd t t t i ƒ  ƒ ƒ D]9 }
 | d t d |
 | | | f ƒ | |
 <| |
 i ƒ  q5Wn | i	 d	 ƒ xà t i ƒ  D]Ò } t i ƒ  } t | | ƒ } t i | ƒ } | i o |  o | i	 d
 | ƒ qn | i i  d d ƒ d j o | i	 d | ƒ qn | o' t" | ƒ o | i# d | | f ƒ q| i# d | | | f ƒ qWx$ | i$ ƒ  D] } | i# d d# ƒ qsWh  } x{| i% ƒ  p | i% ƒ  p | oYx, | i% ƒ  d j o | o t& i' d ƒ qºWxØ| i% ƒ  oÊ| i  t( ƒ \ } } } yt) | d ƒ p t | i* i+ ƒ d j  ou h  } t) | d ƒ oI | i i  d d ƒ | d <y" t& i, | i i  d d ƒ ƒ } WqšqšXn t i | |  } nS t i- h d d 6| i* d 6g  d 6h  d 6| i. d 6d d 6t | i* i+ ƒ d 6ƒ } | i i  d d ƒ } | p | i i  d d ƒ } n | } | i/ d ƒ o | i0 } n d } | o | | j o
 | } n | o | | j o
 | } n | oL d | | | i d <| i1 d | | | f ƒ | o | | i d  <qïn | o | | | <n | o | | | <n t2 | | | ƒ Wqét3 j
 oƒ } d d k4 } d d k5 } | i6 ƒ  \ } } } | i7 d! | ƒ x? | i8 | | ƒ | i9 | ƒ D] } | i7 | i: ƒ  ƒ qšWqéXqéWxH | i$ ƒ  D]: a | t i; ƒ  p# | t =| p | i	 d" ƒ qqÎqÎWq–Wd S($   s!    Spider (fetch) an entire planet s    Socket timeout set to %d secondsiÿÿÿÿNs+   Timeout set to invalid value '%s', skipping(   t   Queue(   t   Threadt   targett   argss   Building work queues   Feed %s already in cacheR7   t   410s   Feed %s goneR›   i    gš™™™™™¹?R8   i,  R]   R:   R9   R;   Rd   R-   RE   RU   R,   RH   s   duplicate subscription: R1   s!   Duplicate subscription: %s and %sR0   s   Error processing %ss%   Finished threaded part of processing.(   NN(<   RP   RQ   R    Rm   RR   t   feed_timeoutR¥   t   setdefaulttimeoutt   floatR_   t   timeoutsockett   setDefaultSocketTimeoutR^   R¹   t	   threadingRº   RŸ   R   R   R{   R   t   intt   spider_threadsR   R¸   t   startt   subscriptionsRS   R   R`   t   parseR]   Re   Rn   R+   R®   t   keyst   qsizeRZ   t   sleept   Falset   hasattrR8   R,   t   strptimeRi   R/   RT   RE   R¦   R•   R§   R¨   R©   Rª   Rc   R«   R¬   R­   t   isAlive(   t   only_if_newR‡   R.   RÁ   R¹   Rº   t   fetch_queuet   parse_queuet   threadst
   http_cacheR   R)   Rˆ   t   feed_sourceR…   t   threadt
   feeds_seenR]   t   optionsR<   R†   RH   RE   R3   Rµ   R¨   R©   RC   RŒ   R¶   R·   (    (    s   /net/bzr/venus/planet/spider.pyt   spiderPlanetd  sÔ    			    !  *  

	    ! ("   t   __doc__RZ   Rr   t   reR   R(   t   xml.domR    RP   RR   R`   Ro   Ry   R¥   Rl   R   t   hashlibR   R   t   compileR   R   R   R   R    Rm   R   Rn   R   R+   R•   R¸   RÌ   RÙ   (    (    (    s   /net/bzr/venus/planet/spider.pyt   <module>   s&   <T	 		Ö	I
