Source code for graphrag_sdk.source

import re
import requests
from bs4 import BeautifulSoup
from abc import ABC, abstractmethod

def Source(source: str) -> 'AbstractSource':
    """
    Creates a source object whose type matches the extension of `source`.

    The extension is compared case-insensitively: a path ending in ".pdf"
    yields a PDF source, one ending in ".html" yields an HTML source, and
    anything else is treated as plain text.

    Parameters:
        source (str): path to source

    Returns:
        AbstractSource: source

    Raises:
        ValueError: if `source` is not a non-empty string.
    """
    if not isinstance(source, str) or source == "":
        # ValueError is a subclass of Exception, so callers that caught the
        # previously raised bare Exception keep working.
        raise ValueError("Invalid argument, source should be a non-empty string.")

    # Match the trailing extension only. A substring test would misclassify
    # paths such as "notes.pdf.txt" or "docs.pdf/readme.txt".
    lowered = source.lower()
    if lowered.endswith(".pdf"):
        return PDF(source)
    if lowered.endswith(".html"):
        return HTML(source)
    return TEXT(source)
class AbstractSource(ABC):
    """
    Abstract base class for a source file.

    Attributes:
        source (str): the value passed to the constructor.
        content: loaded content; initialised to None by the constructor.
    """

    def __init__(self, source: str):
        self.content = None
        self.source = source

    @abstractmethod
    def load(self) -> str:
        """Load the source; concrete subclasses must implement this."""
        ...

    def __eq__(self, other) -> bool:
        # Two sources are equal when both are AbstractSource instances
        # that carry the same `source` value.
        return isinstance(other, AbstractSource) and self.source == other.source

    def __hash__(self):
        # Hash on the same field used by __eq__ so equal objects hash alike.
        return hash(self.source)
class PDF(AbstractSource):
    """
    PDF resource

    Text extraction from PDF files is not implemented; `load` always raises.
    """

    def __init__(self, source):
        super().__init__(source)

    def load(self):
        """
        Placeholder for PDF text extraction.

        Raises:
            NotImplementedError: always. It is a subclass of Exception, so
                callers catching Exception are unaffected.
        """
        # TODO: extract text from PDF, populate self.content
        raise NotImplementedError("Not implemented")
class TEXT(AbstractSource):
    """
    Plain-text resource
    """

    def __init__(self, source):
        super().__init__(source)

    def load(self):
        """
        Return the text of the file at `self.source`.

        The file is read only while `self.content` is None; once populated,
        the stored value is returned without touching the file again.
        """
        if self.content is not None:
            return self.content

        # First call: read the whole file and remember it on the instance.
        with open(self.source, 'r') as handle:
            self.content = handle.read()
        return self.content
class HTML(AbstractSource):
    """
    HTML resource
    """

    def __init__(self, source):
        super().__init__(source)

    def load(self):
        """
        Read the HTML file at `self.source` and return its text.

        The markup is parsed with BeautifulSoup's 'html.parser', the text is
        extracted with `get_text()`, and runs of two or more newlines are
        collapsed into one. The result is stored in `self.content` and
        returned. The file is re-read on every call.

        Returns:
            str: the extracted text.
        """
        with open(self.source, 'r') as f:
            markup = f.read()

        # Work on locals so self.content never holds raw markup if parsing
        # fails part-way through.
        soup = BeautifulSoup(markup, 'html.parser')
        text = soup.get_text()

        # Remove extra newlines
        self.content = re.sub(r'\n{2,}', '\n', text)
        return self.content