
    {h!                         d Z ddlZddlmZmZ ddlmZmZmZm	Z	m
Z
mZ ddlmZ ddlmZ erddlmZ ddlmZ dd	lmZ dd
lmZmZmZ  ej0                  e      Z G d de      Z G d de      Z G d de      Zy)zQLoader that uses Playwright to load a page, then uses unstructured to parse html.    N)ABCabstractmethod)TYPE_CHECKINGAsyncIteratorDictIteratorListOptional)Document)
BaseLoader)Browser)Page)Response)r   r   r   c            	       T    e Zd ZdZedddddddefd	       Zedd
dddddefd       Zy)PlaywrightEvaluatorzAbstract base class for all evaluators.

    Each evaluator should take a page, a browser instance, and a response
    object, process the page as necessary, and return the resulting text.
    pager   browserr   responser   returnc                      y)a  Synchronously process the page and return the resulting text.

        Args:
            page: The page to process.
            browser: The browser instance.
            response: The response from page.goto().

        Returns:
            text: The text content of the page.
        N selfr   r   r   s       m/var/www/html/engine/venv/lib/python3.12/site-packages/langchain_community/document_loaders/url_playwright.pyevaluatezPlaywrightEvaluator.evaluate   s     	    	AsyncPageAsyncBrowserAsyncResponsec                    K   yw)a  Asynchronously process the page and return the resulting text.

        Args:
            page: The page to process.
            browser: The browser instance.
            response: The response from page.goto().

        Returns:
            text: The text content of the page.
        Nr   r   s       r   evaluate_asyncz"PlaywrightEvaluator.evaluate_async*   s      	s   N)__name__
__module____qualname____doc__r   strr   r!   r   r   r   r   r      sl     V i : RU   *8DS	 r   r   c                   Z    e Zd ZdZddeee      fdZddddd	d
defdZddddd	ddefdZ	y)UnstructuredHtmlEvaluatorz@Evaluate the page HTML content using the `unstructured` library.Nremove_selectorsc                 L    	 ddl }|| _        y# t        $ r t        d      w xY w)z%Initialize UnstructuredHtmlEvaluator.r   NzQunstructured package not found, please install it with `pip install unstructured`)unstructuredImportErrorr)   )r   r)   r+   s      r   __init__z"UnstructuredHtmlEvaluator.__init__>   s7    	 !1  	- 	s    #r   r   r   r   r   r   r   c                 T   ddl m} | j                  xs g D ]J  }|j                  |      j	                         }|D ]$  }|j                         s|j                  d       & L |j                         } ||      }dj                  |D 	cg c]  }	t        |	       c}	      S c c}	w )z3Synchronously process the HTML content of the page.r   partition_htmlelement => element.remove()text


unstructured.partition.htmlr0   r)   locatorall
is_visibler   contentjoinr&   
r   r   r   r   r0   selectorelementselementpage_sourceels
             r   r   z"UnstructuredHtmlEvaluator.evaluateJ   s    >--3 	DH||H-113H# D%%'$$%BCD	D lln!{3{{h7CG7887s   B%r   r   r   c                   K   ddl m} | j                  xs g D ]b  }|j                  |      j	                          d{   }|D ]4  }|j                          d{   s|j                  d       d{    6 d |j                          d{   } ||      }dj                  |D 	cg c]  }	t        |	       c}	      S 7 7 m7 U7 ;c c}	w w)z4Asynchronously process the HTML content of the page.r   r/   Nr1   r2   r4   r5   r<   s
             r   r!   z(UnstructuredHtmlEvaluator.evaluate_asyncX   s      	?--3 	JH!\\(37799H# J ++---!**+HIIIJ	J !LLN*!{3{{h7CG788 :-I*7s]   <CCCC	C!C5C6CCC.C C	CCCC)N)
r"   r#   r$   r%   r
   r	   r&   r-   r   r!   r   r   r   r(   r(   ;   sa    J
1$s))< 
19V 9i 9: 9RU 999*89DS9	9r   r(   c                       e Zd ZdZ	 	 	 	 	 ddee   dededeee      dee   dee	eef      fd	Z
d
ee   fdZd
ee   fdZd
ee   fdZy)PlaywrightURLLoadera  Load `HTML` pages with `Playwright` and parse with `Unstructured`.

    This is useful for loading pages that require javascript to render.

    Attributes:
        urls (List[str]): List of URLs to load.
        continue_on_failure (bool): If True, continue loading other URLs on failure.
        headless (bool): If True, the browser will run in headless mode.
        proxy (Optional[Dict[str, str]]): If set, the browser will access URLs
            through the specified proxy.

    Example:
        .. code-block:: python

            from langchain_community.document_loaders import PlaywrightURLLoader

            urls = ["https://api.ipify.org/?format=json",]
            proxy={
                "server": "https://xx.xx.xx:15818", # https://<host>:<port>
                "username": "username",
                "password": "password"
            }
            loader = PlaywrightURLLoader(urls, proxy=proxy)
            data = loader.load()
    Nurlscontinue_on_failureheadlessr)   	evaluatorproxyc                     	 ddl }|| _        || _        || _        || _        |r|rt        d      |xs t        |      | _        y# t        $ r t        d      w xY w)z%Load a list of URLs using Playwright.r   NzMplaywright package not found, please install it with `pip install playwright`z:`remove_selectors` and `evaluator` cannot be both not None)	
playwrightr,   rE   rF   rG   rI   
ValueErrorr(   rH   )r   rE   rF   rG   r)   rH   rI   rK   s           r   r-   zPlaywrightURLLoader.__init__   sv    	 	#6  
	L 
 #Q&?@P&Q#  	+ 	s   A Ar   c           	   #   Z  K   ddl m}  |       5 }|j                  j                  | j                  | j
                        }| j                  D ]u  }	 |j                         }|j                  |      }|t        d|       |j                  d       | j                  j                  |||      }d|i}t        ||       w |j%                          ddd       y# t        $ r4}	| j                  rt         j#                  d	| d
|	        n|	Y d}	~	d}	~	ww xY w# 1 sw Y   yxY ww)zLoad the specified URLs using Playwright and create Document instances.

        Returns:
            A list of Document instances with loaded content.
        r   )sync_playwrightrG   rI   N"page.goto() returned None for url loadsourcepage_contentmetadataError fetching or processing , exception: )playwright.sync_apirN   chromiumlaunchrG   rI   rE   new_pagegotorL   wait_for_load_staterH   r   r   	ExceptionrF   loggererrorclose)
r   rN   pr   urlr   r   r3   rU   es
             r   	lazy_loadzPlaywrightURLLoader.lazy_load   s#     	8 	!jj''djj'QGyy   "++-D#yy~H'(+McU)STT,,V4>>224(KD (#H"xHH & MMO+	 	 !  //;C5aSQ  	 	 	sH   D+ADA2CD	D+	D(*DDDDD($D+c                 `   K   | j                         2 cg c3 d{   }|7 6 c}S c c}w w)Load the specified URLs with Playwright and create Documents asynchronously.
        Use this function when in a jupyter notebook environment.

        Returns:
            A list of Document instances with loaded content.
        N)
alazy_load)r   docs     r   aloadzPlaywrightURLLoader.aload   s*      &*__%677c7777s$   .)%#
%)%).c           	       K   ddl m}  |       4 d{   }|j                  j                  | j                  | j
                         d{   }| j                  D ]  }	 |j                          d{   }|j                  |       d{   }|t        d|       |j                  d       d{    | j                  j                  |||       d{   }d|i}t        ||        |j%                          d{    ddd      d{    y7 7 7 7 7 q7 N# t        $ r5}	| j                  rt         j#                  d	| d
|	        n|	Y d}	~	d}	~	ww xY w7 c7 U# 1 d{  7  sw Y   yxY ww)rg   r   )async_playwrightNrO   rP   rQ   rR   rS   rV   rW   )playwright.async_apirl   rY   rZ   rG   rI   rE   r[   r\   rL   r]   rH   r!   r   r^   rF   r_   r`   ra   )
r   rl   rb   r   rc   r   r   r3   rU   rd   s
             r   rh   zPlaywrightURLLoader.alazy_load   sn     	:#% 	" 	"JJ--t}}DJJ-WWGyy   !(!1!1!33D%)YYs^3H'(+McU)STT226:::!%!>!>tWh!WWD (#H"xHH & --/!!+	" 	" 	"W 43 ;W !  //;C5aSQ  	  "+	" 	" 	" 	"s   FD F5E2D#E2"D-5D%6D-D'(D-7D)8$D-D+D-5E2
E.E2FE0F#E2%D-'D-)D-+D--	E+6*E& E2&E++E20F2F8E;9F F)TTNNN)r"   r#   r$   r%   r	   r&   boolr
   r   r   r-   r   r   re   rj   r   rh   r   r   r   rD   rD   i   s    : %)0437*.R3iR "R 	R
 #49-R /0R S#X'R>8H- >8T(^ 8"-"9 "r   rD   )r%   loggingabcr   r   typingr   r   r   r   r	   r
   langchain_core.documentsr   )langchain_community.document_loaders.baser   rm   r   r   r   r   r   r   rX   	getLoggerr"   r_   r   r(   rD   r   r   r   <module>ru      si    W  # O O - @<6>;; 
		8	$## #L+9 3 +9\@"* @"r   