Skip to content

Instantly share code, notes, and snippets.

@almugabo
almugabo / compare_tokenizer_gemma_llama.py
Created February 23, 2024 19:06
compare tokenizers of gemma and llama
import os
import pandas as pd
import random
from transformers import AutoTokenizer
xFld = '....' # folder with a bunch of text files
xtok_tl = AutoTokenizer.from_pretrained('unsloth/tinyllama-bnb-4bit')
xtok_ge = AutoTokenizer.from_pretrained('unsloth/gemma-2b-bnb-4bit')
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 23 05:27:37 2024
@author: mike
"""
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
@almugabo
almugabo / reference_extraction.py
Created June 3, 2021 04:29
extraction of references in structured form
"""
FROM: https://gist.github.com/sobolevnrm/412763ebae5424a92d3239898b615e2a
Process RIS format following the standard at",
http://referencemanager.com/sites/rm/files/m/direct_export_ris.pdf """
import re
ALLOWED_TAGS = {"TY" : "Record start",
"ER" : "Record end",
@almugabo
almugabo / get_photo_unsplash.py
Last active May 31, 2021 08:43
get picture from unsplash
# get photo from unsplash
# depends on python-unsplash
# !pip install python-unsplash
from unsplash.api import Api
from unsplash.auth import Auth
import requests
from PIL import Image
from io import BytesIO
@almugabo
almugabo / resnews_template_word.py
Last active May 29, 2021 08:30
resnews_template_word.py
# create style
from docx import Document
# create style
from docx.enum.style import WD_STYLE_TYPE
from docx.shared import Inches, Pt
## TO DO
## [1] add an image in a cell
@almugabo
almugabo / resnews_templating.py
Created April 30, 2021 10:17
research news template
from string import Template
xDocHead = '''<!DOCTYPE html>
<html><head>
<meta http-equiv="content-type" content="text/html; charset=windows-1252">
<style>
.title_orange {
font-family: Verdana, Geneva, sans-serif;
font-size: 14px;
@almugabo
almugabo / open_patent_services.py
Created May 3, 2019 04:13
open patent services
import requests
from base64 import b64encode
import requests
import json
UrlAuth = 'https://ops.epo.org/3.2/auth/accesstoken'
UrlServiceBase = 'https://ops.epo.org/3.2/rest-services/published-data'
UrlServiceBaseSearch = UrlServiceBase + '/search/biblio/?q='
@almugabo
almugabo / HungarianMethod.py
Created March 24, 2019 19:22
Hungarian Method, wrapper around scipy.optimize.linear_sum_assignment
import pandas as pd
import numpy as np
from scipy.optimize import linear_sum_assignment
def make_assignments(xDF):
'''
a simple wrapper around the
scipy.optimize.linear_sum_assignment
which implements the Hungarian Algorithm
@almugabo
almugabo / genMetaPaths.py
Created March 7, 2019 13:47
Metapaths2vec utilities
#!/usr/bin/env python
# coding: utf-8
# March 7th, 2019 (by mm)
## Generating paths (walks)
## inspired by the original py4genMetaPaths.py from
## but also : by stellargraph
## TO DO : create tests
import pandas as pd
import random
@almugabo
almugabo / PythonExcelConditionalFormat.py
Last active November 11, 2018 18:07
CONDITIONAL FORMATTING in Excel with python with xlsxwriter
# CONDITIONAL FORMATTING in Excel with python
# we could use OpenPyXL or xlsxwriter
# Here we use xlsxwriter
#https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html
#https://xlsxwriter.readthedocs.io/example_conditional_format.html#ex-cond-format
import pandas as pd
import xlsxwriter