Skip to content

Instantly share code, notes, and snippets.

@ahasha
Last active July 15, 2024 23:56
Show Gist options
  • Save ahasha/f2693f20d2f0e05c14e216e21b1b6e9f to your computer and use it in GitHub Desktop.
Save ahasha/f2693f20d2f0e05c14e216e21b1b6e9f to your computer and use it in GitHub Desktop.
Defining information extraction for LLMs
class Action(BaseModel):
    """Information about an action item described in the document."""

    # NOTE(review): this model is nested inside the schema passed to
    # ``llm.with_structured_output``; the class docstring and every Field
    # description are presumably forwarded to the model as prompt text, so
    # treat wording changes here as prompt changes.

    # Identifier as printed in the source document, when there is one.
    # Defaults to None (like ``owner``) so that a response which omits the
    # identifier still validates.
    id: Optional[str] = Field(
        default=None,
        description="If a Unique Identifier for the action is given in the text, record it here.",
    )
    # Category code for the action. The description text still says "goal";
    # NOTE(review): looks copied from the Goal model -- confirm whether the
    # wording should say "action" before changing the prompt.
    action_category: str = Field(
        description=(
            "\n"
            "The Action Category the goal is associated with, e.g. Buildings, Transportation, Waste, Governance, Conservation, or Energy.\n"
            "Select Energy only if no other more specific emission category is mentioned, or if the goal pertains specifically to electricity.\n"
            "Governance is for goals related to municipal staffing, policies, or processes to support execution of the Climate Action Plan.\n"
            "Should be a valid ActionCategory value: G, Z, B, E, T, W, C\n"
        )
    )
    # Responsible parties; None when the text names nobody.
    owner: Optional[List[str]] = Field(
        default=None,
        description="The entities or individuals responsible for the action, if mentioned in the text",
    )
    # Short summary of the action.
    description: str = Field(description="A summary description of the action")
    # Source passage backing the summary, and the page it came from.
    context: str = Field(
        description="Verbatim text from the provided document on which the Action description is based"
    )
    context_page: int = Field(
        description="The page number of the document that the context string was drawn from."
    )
def get_extraction_results_with_backoff(chain, text):
    """Run the extraction chain on ``text``, halving the input when parsing fails.

    The chain is invoked with ``{"text": text}``. If it raises
    ``OutputParserException`` the text is cut at its midpoint, each half is
    processed recursively, and the two partial results are merged into a
    single ``Results``.

    Args:
        chain: Object exposing ``invoke(dict)``; called with the key ``"text"``.
        text: Document text to extract from.

    Returns:
        Whatever ``chain.invoke`` returns on success, or a ``Results`` whose
        ``goals`` and ``actions`` are the concatenation of both halves'
        lists (first half first).

    Raises:
        OutputParserException: If parsing still fails once ``text`` is
            shorter than two characters and can no longer be split.
    """
    try:
        return chain.invoke({"text": text})
    except OutputParserException as e:
        logger.error("A validation error occurred: %s", e)
        # A text of 0 or 1 characters cannot be split into two strictly
        # smaller pieces; recursing on it would never terminate.
        if len(text) < 2:
            raise
        logger.error("Retrying with content split in half")

    midpoint = len(text) // 2
    first = get_extraction_results_with_backoff(chain, text[:midpoint])
    second = get_extraction_results_with_backoff(chain, text[midpoint:])
    # Results declares both lists as Optional, so guard against None before
    # concatenating.
    return Results(
        goals=(first.goals or []) + (second.goals or []),
        actions=(first.actions or []) + (second.actions or []),
    )
class Goal(BaseModel):
    """Information about a strategic planning Goal.
    Goals are quantified outcomes necessary to meet emissions targets and resilience goals.
    If the goal does not mention a quantitative target and a target year, you should skip it or classify it as an Action instead.
    """

    # NOTE(review): this model is nested inside the schema passed to
    # ``llm.with_structured_output``; the docstring above and the Field
    # descriptions below are presumably forwarded to the model as prompt
    # text, so they are kept verbatim.

    # Identifier as printed in the source document, when there is one.
    # Defaults to None so that a response which omits it still validates.
    document_goal_id: Optional[str] = Field(
        default=None,
        description="If a unique identifier for the Goal ID is given in the text record it here.",
    )
    # Category code for the goal; the description lists the accepted codes.
    action_category: str = Field(
        description=(
            "\n"
            "The Action Category the goal is associated with, e.g. Buildings, Transportation, Waste, Governance, Conservation, or Energy.\n"
            "Select Energy only if no other more specific emission category is mentioned, or if the goal pertains specifically to electricity.\n"
            "Governance is for goals related to municipal staffing, policies, or processes to support execution of the Climate Action Plan.\n"
            "Should be a valid ActionCategory value: G, Z, B, E, T, W, C\n"
        )
    )
    # Target year, validated to the inclusive range 1990-2100; may be None.
    year: Optional[int] = Field(
        default=None,
        description="The year by which the goal should be achieved.",
        ge=1990,
        le=2100,
    )
    # Short summary of the goal.
    description: str = Field(
        description="A summary description of the goal, which must include a quantitative target and a target year.",
    )
    # Source passage backing the summary, and the page it came from.
    context: str = Field(
        description="Verbatim text from the provided document on which the Goal description is based"
    )
    context_page: int = Field(
        description="The page number of the document that the context string was drawn from."
    )
from langchain_openai import ChatOpenAI
# Top-level extraction payload: the schema given to
# ``llm.with_structured_output`` and the type built when partial results
# are merged after a split retry.
# NOTE(review): no class docstring is added on purpose -- it would
# presumably become part of the schema description sent to the model.
class Results(BaseModel):
    # Extracted goals; defaults to an empty list. Optional also admits None,
    # so consumers should not assume a list without checking.
    goals: Optional[List[Goal]] = []
    # Extracted actions; same default and None caveat as ``goals``.
    actions: Optional[List[Action]] = []
# Chat model client: model name comes from ``llm_model``, temperature is 0.
llm = ChatOpenAI(temperature=0, model=llm_model)
# Prompt template that supplies the text to extract from.
prompt = get_prompt_template()
# Wrap the model so its reply is parsed into a ``Results`` instance, then
# pipe the prompt into it to form the runnable extraction chain.
_structured_llm = llm.with_structured_output(schema=Results)
chain = prompt | _structured_llm
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment