From 220045b92b6a6b89ea430180b4346a5a5b25dd9a Mon Sep 17 00:00:00 2001
From: DennisEckerskorn <denniseckerskorn@gmail.com>
Date: Thu, 31 Oct 2024 12:16:05 +0100
Subject: [PATCH] Updated Project, reads files and extracts links, but problems
 with threads

---
 Ejercicio04_WebScrapper/Main.py               |  69 ++++++++++++++++++
 Ejercicio04_WebScrapper/WebFileReader.py      |  32 +++++++-
 .../__pycache__/WebFileReader.cpython-312.pyc | Bin 0 -> 1846 bytes
 .../resources/input_html/1.html               |  11 +++
 .../resources/input_html/2.html               |  11 +++
 .../resources/input_html/index.html           |   9 +++
 .../resources/output/1_content.txt            |  11 +++
 .../resources/output/extracted_links.txt      |   7 ++
 .../resources/output/index_content.txt        |   9 +++
 9 files changed, 158 insertions(+), 1 deletion(-)
 create mode 100644 Ejercicio04_WebScrapper/Main.py
 create mode 100644 Ejercicio04_WebScrapper/__pycache__/WebFileReader.cpython-312.pyc
 create mode 100644 Ejercicio04_WebScrapper/resources/input_html/1.html
 create mode 100644 Ejercicio04_WebScrapper/resources/input_html/2.html
 create mode 100644 Ejercicio04_WebScrapper/resources/input_html/index.html
 create mode 100644 Ejercicio04_WebScrapper/resources/output/1_content.txt
 create mode 100644 Ejercicio04_WebScrapper/resources/output/extracted_links.txt
 create mode 100644 Ejercicio04_WebScrapper/resources/output/index_content.txt

diff --git a/Ejercicio04_WebScrapper/Main.py b/Ejercicio04_WebScrapper/Main.py
new file mode 100644
index 0000000..037a9c9
--- /dev/null
+++ b/Ejercicio04_WebScrapper/Main.py
@@ -0,0 +1,69 @@
+import os
+import threading
+import queue
+import time
+from WebFileReader import WebFileReader
+
+print("Directorio actual:", os.getcwd())
+
+# Queues used to pass work between the threads
+data_queue = queue.Queue()
+link_queue = queue.Queue()
+
+# Shared WebFileReader instance used by the threads below
+reader = WebFileReader()
+
+def hilo_a():
+    """Thread A: read each input HTML file and enqueue (filename, content) on data_queue."""
+    for filename in ("index.html", "1.html", "2.html"):
+        print(f"[Hilo A] Leyendo archivo: {filename}")
+        html_text = reader.read_file(filename)
+        # read_file() returns None on failure; falsy (also empty) content is not enqueued
+        if html_text:
+            data_queue.put((filename, html_text))
+        time.sleep(1)  # simulated wait time
+
+def hilo_b():
+    """Thread B: take (filename, html) items from data_queue and put each extracted link on link_queue."""
+    # NOTE(review): hilo_c consumes data_queue too, so each item reaches only one of the two.
+    while True:
+        # get() blocks until an item is available, replacing the CPU-burning poll on empty()
+        filename, html_content = data_queue.get()
+        print(f"[Hilo B] Extrayendo enlaces de: {filename}")
+        for link in reader.extract_links(html_content):
+            link_queue.put(link)
+
+def hilo_c():
+    """Thread C: save each (filename, html) item taken from data_queue as <name>_content.txt."""
+    # NOTE(review): hilo_b consumes data_queue too, so each item reaches only one of the two.
+    while True:
+        filename, html_content = data_queue.get()  # blocks until an item arrives; no polling
+        save_path = f"EjerciciosConHilos/Ejercicio04_WebScrapper/resources/output/{filename.replace('.html', '')}_content.txt"
+        with open(save_path, 'w', encoding='utf-8') as file:
+            file.write(html_content)
+            print(f"[Hilo C] Guardando contenido de {filename} en {save_path}")
+
+def hilo_d():
+    """Thread D: append every link taken from link_queue to extracted_links.txt, one per line."""
+    while True:
+        # get() blocks until a link is available, replacing the CPU-burning poll on empty()
+        link = link_queue.get()
+        with open("EjerciciosConHilos/Ejercicio04_WebScrapper/resources/output/extracted_links.txt", 'a', encoding='utf-8') as file:
+            file.write(link + "\n")
+        print(f"[Hilo D] Enlace guardado: {link}")
+
+# Daemon threads: they do not keep the process alive once the main thread exits
+thread_a = threading.Thread(target=hilo_a, daemon=True)
+thread_b = threading.Thread(target=hilo_b, daemon=True)
+thread_c = threading.Thread(target=hilo_c, daemon=True)
+thread_d = threading.Thread(target=hilo_d, daemon=True)
+
+for worker in (thread_a, thread_b, thread_c, thread_d):
+    worker.start()
+
+# Keep the main thread alive until the user interrupts with Ctrl+C
+try:
+    while True:
+        time.sleep(1)
+except KeyboardInterrupt:
+    print("Programa terminado")
\ No newline at end of file
diff --git a/Ejercicio04_WebScrapper/WebFileReader.py b/Ejercicio04_WebScrapper/WebFileReader.py
index 6d8f77c..8b26a30 100644
--- a/Ejercicio04_WebScrapper/WebFileReader.py
+++ b/Ejercicio04_WebScrapper/WebFileReader.py
@@ -1 +1,31 @@
-import mysql.connector
\ No newline at end of file
+import os
+from bs4 import BeautifulSoup
+
+class WebFileReader:
+    """Read local HTML files and extract the links they contain."""
+
+    def __init__(self, base_path="EjerciciosConHilos/Ejercicio04_WebScrapper/resources/input_html"):
+        # Directory that read_file() joins each filename onto.
+        self.base_path = base_path
+
+    def read_file(self, filename):
+        """Return the text of base_path/filename, or None if it cannot be read."""
+        filepath = os.path.join(self.base_path, filename)
+        try:
+            with open(filepath, mode='r', encoding='utf-8') as handle:
+                text = handle.read()
+        except FileNotFoundError:
+            print(f"[WebFileReader] Archivo no encontrado: {filepath}")
+            return None
+        except Exception as e:
+            # Any other failure is reported and mapped to None as well.
+            print(f"[WebFileReader] Error al leer el archivo {filepath}: {str(e)}")
+            return None
+        return text
+
+    def extract_links(self, html_content):
+        """Return the href value of every <a> tag in html_content that has one."""
+        parsed = BeautifulSoup(html_content, 'html.parser')
+        return [anchor['href'] for anchor in parsed.find_all('a', href=True)]
+
+
diff --git a/Ejercicio04_WebScrapper/__pycache__/WebFileReader.cpython-312.pyc b/Ejercicio04_WebScrapper/__pycache__/WebFileReader.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..de9ef432ba909bef2f21b4541999e866a8603a30
GIT binary patch
literal 1846
zcmZWqO-x)>6h8O;&>1@XOv`VO<SCdsiGrF+1yM5;C?<t8Qe!&OygcS!;Ej*>ruV%8
zW*m|(Sg_Ct>0+7~6Y9d&z@kgyj<p*MHR4lOx-oHqVA6;i&wXzOn0h92=G^ml&%Nin
zXTGbes|BP#Q(d!P%K*QLMTL~IaP%??C7?iw1u5|>S!7;HNdzRg22@@Fs%%Q3cS_Ea
zg!+eAOeBb3H^hvbXU^oTNhg<0hdv=M2f&er>k=?X5eg}Zs+6pdluV@+NGT$DO3f>Y
z8o%yCHa%)u?0sfX#?zH>kxa-~Jse#?R)Qiaf{s#1c}NgZN_5(AnVvPgxrF2^F0*F%
zIrMNvu%;cS(gRZ;xQx3~!_2l#cj`von{(_N+{v)C=T41ea8omFI__1+9y2ZHjOx|P
zIu<vX=0-Nl_*7M)J=we;(RI@{JzX!fRNE?gRTL6Mb{F>BUszQ>t3Sd8R2{Mg6~Q5o
zQ7C~Y90JOmy^4xXWPFB-kP$hap(3fYZdr<gCyEHDC{aSCEqPPyiS!06%SAbJK3tvU
zo8?7XqKf!b@)4MXE_e-GvWQT*1q45NWl6pTi=+#@SS3d{@Hhdlp|YlGQThVDlr~AI
z5wUq?5yHqsP2s&-txVS$({i?E@bsLy;Amqv$FFO7jk1Ltvlf`<=Crh9d(1YegVMaC
zd2GpZdQRef!V7B7n|b9;1W1J0X@{Ej>_p+xM^$~gttHDl*p4Q=Y>yk%8PF0DUvgYu
z5vcN&jAPmXrhLW8GTT=;Le-B8&`vnssFSnl2<Hy>)hst{FQM=Vp3slXpo4A9vrr0i
z37sompg1WPUGbwLoh}T8)~fP)P7ETSn?ZIL{*FLXOX<q`mD1q);D++B{z3iY$lk^N
z)yRRQM%#XgH?PfY$d6`!i1%%G@3(f8^6U9~pFEXGUDscL?8kWDZoKc`-y5_;kkCIC
z-)(wpdu)4P_rmCt*tMT7wjQXcKX!X?yRG`92H!R&o8+C^D2h8xYO?d(PP;7R&VXyJ
z$pLw%pCGS-wE$`WYa0cGb-C$)ub7^JW>LzNfrxG=L?tS3od9kaKF8d%_-t<D%NB5@
zYCHO_OP@v+SXNM@Orn+GcUfKpgwn_f6h@Y?r%dx4>bM%dZtSyVq?wCo5)eDZQ2;WE
z?LUQ4;ryI8Z}nsijswIe#!W<%GRN5re-5`#NL-XIx@%M2u&jg{Ha;ez^f0rBBMB!Y
z?+~7<W!fLRuDBY}{&N585WCz+dwO6kbW~%Y%GiXF?hvv72aTQgryjk%*U-PJ?03KP
zaQwme<F4)DjqxYl$-Ty*@4J)t@2thw`tG&wb|+WW-NvE)hNjinKZi2JuaW?SU&UTm
zaE2D%{r?o~JyBIp!CubrKpbzd*R-=aPZx*9F^;oPU|rY!T3w%aXwDLHOxN$^3@fy7
z@t?#+=lJu;z5y<#PVfl0xRZd1x8cT-t-z5))ULytTD9${twwDQy@eybFfmnpdD^`k
t^dJm>5p}02k0XO&8Sn5*Xb|{wn~@#Jgpi*h`YUukRa!{%As`FF{sVBVzsLXp

literal 0
HcmV?d00001

diff --git a/Ejercicio04_WebScrapper/resources/input_html/1.html b/Ejercicio04_WebScrapper/resources/input_html/1.html
new file mode 100644
index 0000000..a818cc2
--- /dev/null
+++ b/Ejercicio04_WebScrapper/resources/input_html/1.html
@@ -0,0 +1,11 @@
+<html lang="es">
+    <head>
+        <meta charset="UTF-8">
+        <title>Página 1</title>
+    </head>
+    <BODY>
+        <a href="2.html">Ir a página 2</a>
+        <br>
+        <a href="index.html">Volver</a>
+    </BODY>
+</html>
\ No newline at end of file
diff --git a/Ejercicio04_WebScrapper/resources/input_html/2.html b/Ejercicio04_WebScrapper/resources/input_html/2.html
new file mode 100644
index 0000000..0635933
--- /dev/null
+++ b/Ejercicio04_WebScrapper/resources/input_html/2.html
@@ -0,0 +1,11 @@
+<html lang="es">
+    <head>
+        <meta charset="UTF-8">
+        <title>Página 2</title>
+    </head>
+    <BODY>
+        <a href="1.html">Ir a página 1</a>
+        <br>
+        <a href="index.html">Volver</a>
+    </BODY>
+</html>
\ No newline at end of file
diff --git a/Ejercicio04_WebScrapper/resources/input_html/index.html b/Ejercicio04_WebScrapper/resources/input_html/index.html
new file mode 100644
index 0000000..cf1c6dd
--- /dev/null
+++ b/Ejercicio04_WebScrapper/resources/input_html/index.html
@@ -0,0 +1,9 @@
+<html lang="es">
+    <head>
+        <meta charset="UTF-8">
+        <title>Inicio</title>
+    </head>
+    <BODY>
+        <a href="1.html">Ir a página 1</a>
+    </BODY>
+</html>
\ No newline at end of file
diff --git a/Ejercicio04_WebScrapper/resources/output/1_content.txt b/Ejercicio04_WebScrapper/resources/output/1_content.txt
new file mode 100644
index 0000000..a818cc2
--- /dev/null
+++ b/Ejercicio04_WebScrapper/resources/output/1_content.txt
@@ -0,0 +1,11 @@
+<html lang="es">
+    <head>
+        <meta charset="UTF-8">
+        <title>Página 1</title>
+    </head>
+    <BODY>
+        <a href="2.html">Ir a página 2</a>
+        <br>
+        <a href="index.html">Volver</a>
+    </BODY>
+</html>
\ No newline at end of file
diff --git a/Ejercicio04_WebScrapper/resources/output/extracted_links.txt b/Ejercicio04_WebScrapper/resources/output/extracted_links.txt
new file mode 100644
index 0000000..0ca3f30
--- /dev/null
+++ b/Ejercicio04_WebScrapper/resources/output/extracted_links.txt
@@ -0,0 +1,7 @@
+2.html
+index.html
+1.html
+index.html
+1.html
+1.html
+index.html
diff --git a/Ejercicio04_WebScrapper/resources/output/index_content.txt b/Ejercicio04_WebScrapper/resources/output/index_content.txt
new file mode 100644
index 0000000..cf1c6dd
--- /dev/null
+++ b/Ejercicio04_WebScrapper/resources/output/index_content.txt
@@ -0,0 +1,9 @@
+<html lang="es">
+    <head>
+        <meta charset="UTF-8">
+        <title>Inicio</title>
+    </head>
+    <BODY>
+        <a href="1.html">Ir a página 1</a>
+    </BODY>
+</html>
\ No newline at end of file