% You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
%
% 2687 lines
% 70 KiB
%
% 3 years ago
\documentclass[11pt,a4paper]{article}
%%\documentclass[8pt,a4paper,twocolumn]{article}
\usepackage[a4paper,left=1cm,right=1cm,top=1.0cm,bottom=2.5cm]{geometry}
\usepackage[turkish]{babel}
\usepackage{times}
\usepackage{graphicx}
\usepackage{natbib}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage[utf8]{inputenc}
\usepackage{nomencl}
\usepackage{commath}
\usepackage{url}
\usepackage{subfig}
\usepackage[colorinlistoftodos]{todonotes}
\usepackage{epstopdf}
\usepackage{amsmath}
\usepackage{mathtools}
\usepackage{paralist}
\usepackage{hyperref}
\usepackage{pythonhighlight}
\usepackage{listings}
\definecolor{codegreen}{rgb}{0,0.6,0}
\definecolor{codegray}{rgb}{0.5,0.5,0.5}
\definecolor{codepurple}{rgb}{0.58,0,0.82}
\definecolor{backcolour}{rgb}{0.95,0.95,0.92}
\lstdefinestyle{mystyle}{
backgroundcolor=\color{backcolour},
commentstyle=\color{codegreen},
keywordstyle=\color{magenta},
numberstyle=\tiny\color{codegray},
stringstyle=\color{codepurple},
basicstyle=\footnotesize,
breakatwhitespace=false,
breaklines=true,
captionpos=b,
keepspaces=true,
numbers=left,
numbersep=5pt,
showspaces=false,
showstringspaces=false,
showtabs=false,
tabsize=2
}
\def\ExtendVersion{1}
\newcommand{\credit}[1]{Thanks to: #1}
\input{taypack.tex}
%turning on/off comments
\usepackage{comment}
\includecomment{comment} %show comments
%\excludecomment{comment} %do not show comments
\lstset{style=mystyle}
\begin{document}
\lstset{language=Python}
% first the title is needed
\title{\centering{Python: From Basics to the Extreme}}
% the name(s) of the author(s) follow(s) next
%\author{Keke\c{c}}
%\author{İbrâhim Taygun Keke\c{c}}
%\author{İbrâhim Taygun Keke\c{c}}
%\author{UL}
\maketitle
%\begin{abstract}
%\end{abstract}
\newcounter{Madde}[section]
\newenvironment{Madde}[1][]{\refstepcounter{Madde}\par\medskip
\textbf{Md ~\theMadde. #1} \rmfamily}{\medskip}
\newcommand{\citemd}[1]{(\textbf{M. {#1}})}
\tableofcontents
\section{Introduction}
This reference book contains very frequently used concepts of the Python language.
The reader is assumed to be familiar with the abstract concepts of variables, loops, functions and such.
In each section, a working code example is provided.
\textbf{Correct usage of the material.}
\begin{itemize}
\item analyze the code example
\item copy/paste and run the example
\item check whether the output of the code makes sense.
\end{itemize}
\section{Basics}
\subsection{Variables and Data Types}
You can declare integer and string variables.
\begin{python}
x = 5
y = "John"
print(x)
#5
print(y)
#John
\end{python}
Declare string, integer or float.
\begin{python}
x = str(3) # x will be '3'
y = int(3) # y will be 3
z = float(3) # z will be 3.0
print(type(x))
#<class 'str'>
print(type(y))
#<class 'int'>
\end{python}
Different variable types exist. But for now, focus on string, integer, float, list, tuple, dictionary, and sets.
\begin{python}
Text Type: str
Numeric Types: int, float, complex
Sequence Types: list, tuple, range
Mapping Type: dict
Set Types: set, frozenset
Boolean Type: bool
Binary Types: bytes, bytearray, memoryview
None Type: NoneType
\end{python}
List is a collection which is ordered and changeable. Allows duplicate members.
Tuple is a collection which is ordered and unchangeable. Allows duplicate members.
Set is a collection which is unordered and unindexed. Its items cannot be changed in place, but items can be added and removed. No duplicate members.
Dictionary is a collection of key--value pairs which is ordered (since Python 3.7) and changeable. No duplicate keys.
\subsection{List}
List is a python data type. Lists can store multiple variables in a single variable.
\begin{python}
thislist = ["apple", "banana", "cherry"]
print(thislist)
#['apple', 'banana', 'cherry']
\end{python}
\subsubsection{access list items}
\begin{python}
thislist = ["apple", "banana", "cherry"]
print(thislist[1])
#banana
\end{python}
\subsubsection{change list item value}
\begin{python}
thislist = ["apple", "banana", "cherry"]
thislist[1] = "blackcurrant"
print(thislist)
#['apple', 'blackcurrant', 'cherry']
\end{python}
\subsubsection{change a range of list item values}
\begin{python}
thislist = ["apple", "banana", "cherry", "orange", "kiwi", "mango"]
thislist[1:3] = ["blackcurrant", "watermelon"]
print(thislist)
#['apple', 'blackcurrant', 'watermelon', 'orange', 'kiwi', 'mango']
\end{python}
\subsubsection{add list items}
\begin{python}
thislist = ["apple", "banana", "cherry"]
thislist.append("orange")
print(thislist)
#['apple', 'banana', 'cherry', 'orange']
\end{python}
\subsubsection{remove list items}
\begin{python}
thislist = ["apple", "banana", "cherry"]
thislist.remove("banana")
print(thislist)
#['apple', 'cherry']
\end{python}
\paragraph{remove duplicates from the list}
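If an element has not been seen before, the condition first adds it to the set \texttt{seen} and then keeps it in the result list. Elements that are already in \texttt{seen} are skipped.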
\begin{python}
a = [1,2,3,2,1,5,6,5,5,5]
seen = set()
uniq = [x for x in a if x not in seen and not seen.add(x)]
print(uniq)
#[1, 2, 3, 5, 6]
\end{python}
\subsubsection{looping a list}
\begin{python}
thislist = ["apple", "banana", "cherry"]
for x in thislist:
print(x)
#apple
#banana
#cherry
\end{python}
\subsubsection{list slicing}
you can access multiple elements with slicing operation.
\begin{python}
thislist = ["apple", "banana", "cherry", "watermelon", "grape", "kiwi"]
print(thislist[1:4] )
#['banana', 'cherry', 'watermelon']
## Here, we access each 2nd element from 1st to 6th element of the list.
print(thislist[1:6:2])
#['banana', 'watermelon', 'kiwi']
## Reverse slicing: access elements from end to beginning of the list.
print(thislist[6:1:-1])
#['kiwi', 'grape', 'watermelon', 'cherry']
\end{python}
\subsubsection{list comprehension}
This powerful concept can single-line the traditional loops.
\begin{python}
fruits = ["apple", "banana", "cherry", "kiwi", "mango"]
newlist = []
for x in fruits:
if "a" in x:
newlist.append(x)
print(newlist)
# ['apple', 'banana', 'mango']
fruits = ["apple", "banana", "cherry", "kiwi", "mango"]
newlist = [x for x in fruits if "a" in x]
print(newlist)
# ['apple', 'banana', 'mango']
\end{python}
\subsubsection{list comprehension: condition + operation}
\begin{python}
fruits = ["apple", "banana", "cherry", "kiwi", "mango"]
# condition: keep only the fruits containing "a"; operation: upper-case each kept item
newlist = [x.upper() for x in fruits if "a" in x]
print(newlist)
#['APPLE', 'BANANA', 'MANGO']
\end{python}
\subsubsection{sorting a list}
\begin{python}
thislist = ["orange", "mango", "kiwi", "pineapple", "banana"]
thislist.sort()
print(thislist)
#['banana', 'kiwi', 'mango', 'orange', 'pineapple']
\end{python}
\subsubsection{copying a list}
\begin{python}
thislist = ["apple", "banana", "cherry"]
mylist = thislist.copy()
print(mylist)
#['apple', 'banana', 'cherry']
\end{python}
\subsubsection{join two lists}
\begin{python}
list1 = ["a", "b", "c"]
list2 = [1, 2, 3]
list3 = list1 + list2
print(list3)
#['a', 'b', 'c', 1, 2, 3]
\end{python}
\subsubsection{looping through multiple Lists}
\begin{python}
a = ['a1', 'a2', 'a3']
b = ['b1', 'b2']
for x, y in zip(a, b):
print(x, y)
#a1 b1
#a2 b2
\end{python}
\subsubsection{other list methods}
\begin{python}
Method Description
append() Adds an element at the end of the list
clear() Removes all the elements from the list
copy() Returns a copy of the list
count() Returns the number of elements with the specified value
extend() Add the elements of a list (or any iterable), to the end of the current list
index() Returns the index of the first element with the specified value
insert() Adds an element at the specified position
pop() Removes the element at the specified position
remove() Removes the item with the specified value
reverse() Reverses the order of the list
sort() Sorts the list
\end{python}
\subsection{Dictionary}
Unlike lists, in dictionary each entry has a key, and a value. These key, value pairs constitute the dictionary content.
\begin{python}
thisdict = {
"brand": "Ford",
"model": "Mustang",
"year": 1964
}
print(thisdict)
#{'brand': 'Ford', 'model': 'Mustang', 'year': 1964}
\end{python}
\subsubsection{access dictionary values}
\begin{python}
thisdict = {
"brand": "Ford",
"model": "Mustang",
"year": 1964
}
print(thisdict["brand"])
#Ford
\end{python}
\subsubsection{access dictionary keys or values}
\begin{python}
car = {
"brand": "Ford",
"model": "Mustang",
"year": 1964
}
x = car.keys()
print(x)
#dict_keys(['brand', 'model', 'year'])
v = car.values()
print(v)
#dict_values(['Ford', 'Mustang', 1964])
\end{python}
\subsubsection{inverse dictionary lookup}
\begin{python}
d = {"a":0, "b":1, "c":2}
dict(zip(d.values(), d.keys()))
#{0: 'a', 1: 'b', 2: 'c'}
\end{python}
\subsubsection{change dictionary items}
\begin{python}
car = {
"brand": "Ford",
"model": "Mustang",
"year": 1964
}
x = car.items()
print(x) #before the change
#dict_items([('brand', 'Ford'), ('model', 'Mustang'), ('year', 1964)])
car["year"] = 2020
print(x) #after the change
#dict_items([('brand', 'Ford'), ('model', 'Mustang'), ('year', 2020)])
\end{python}
\subsubsection{add dictionary items}
\begin{python}
thisdict = {
"brand": "Ford",
"model": "Mustang",
"year": 1964
}
thisdict["color"] = "red"
print(thisdict)
#{'brand': 'Ford', 'model': 'Mustang', 'year': 1964, 'color': 'red'}
\end{python}
\subsubsection{remove dictionary item}
\begin{python}
thisdict = {
"brand": "Ford",
"model": "Mustang",
"year": 1964
}
del thisdict["model"]
print(thisdict)
#{'brand': 'Ford', 'year': 1964}
\end{python}
\subsubsection{looping a dictionary}
\begin{python}
thisdict={'brand': 'Ford', 'year': 1964}
for x, y in thisdict.items():
print(x, y)
#brand Ford
#year 1964
\end{python}
\subsubsection{copy a dictionary}
\begin{python}
thisdict = {
"brand": "Ford",
"model": "Mustang",
"year": 1964
}
mydict = thisdict.copy()
print(mydict)
#{'brand': 'Ford', 'model': 'Mustang', 'year': 1964}
\end{python}
\subsubsection{intersections of two dictionaries}
\begin{python}
some_dict = {'zope':'zzz', 'python':'rocks' }
another_dict = {'python':'rocks', 'perl':'interesting' }
# keys that appear in both dictionaries, in the order they occur in some_dict
common_keys = [k for k in some_dict if k in another_dict]
print("Intersects:", common_keys)
#Intersects: ['python']
\end{python}
\subsubsection{other dictionary operations}
\begin{python}
clear() Removes all the elements from the dictionary
copy() Returns a copy of the dictionary
fromkeys() Returns a dictionary with the specified keys and value
get() Returns the value of the specified key
items() Returns a list containing a tuple for each key value pair
keys() Returns a list containing the dictionary's keys
pop() Removes the element with the specified key
popitem() Removes the last inserted key-value pair
setdefault() Returns the value of the specified key. If the key does not exist: insert the key, with the specified value
update() Updates the dictionary with the specified key-value pairs
values() Returns a list of all the values in the dictionary
\end{python}
\subsection{Sets}
Python sets are like lists. They are used to store multiple items in a single variable. Set elements are unordered. Items are unchangeable, i.e.\ they cannot be altered after creation, but we can add elements to and remove elements from the set. Unlike lists, sets cannot contain two identical elements.
\begin{python}
thisset = {"apple", "banana", "cherry"}
print(thisset)
#{'apple', 'cherry', 'banana'}
##length
print(len(thisset))
#3
\end{python}
Sets can have different objects as elements.
\begin{python}
set1 = {"abc", 34, True, 40, "male"}
\end{python}
\subsubsection{access elements}
\begin{python}
thisset = {"apple", "banana", "cherry"}
for x in thisset:
print(x)
#apple
#cherry
#banana
\end{python}
\subsubsection{check element existence}
\begin{python}
thisset = {"apple", "banana", "cherry"}
print("banana" in thisset)
#True
\end{python}
\subsubsection{add elements}
\begin{python}
thisset = {"apple", "banana", "cherry"}
thisset.add("orange")
print(thisset)
#{'orange', 'apple', 'cherry', 'banana'}
\end{python}
\subsubsection{remove elements}
We can remove the set elements with remove() function.
\begin{python}
thisset = {"apple", "banana", "cherry"}
thisset.remove("banana")
print(thisset)
#{'apple', 'cherry'}
\end{python}
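Alternatively we can remove an element using the pop() function. Since sets are unordered, pop() removes and returns an arbitrary element, so the output below may differ on your machine.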
\begin{python}
thisset = {"apple", "banana", "cherry"}
x = thisset.pop()
print(x)
#apple
print(thisset)
#{'cherry', 'banana'}
\end{python}
\subsubsection{looping sets}
\begin{python}
thisset = {"apple", "banana", "cherry"}
for x in thisset:
print(x)
#apple
#cherry
#banana
\end{python}
\subsubsection{joining/combining/ (union) of sets}
\begin{python}
set1 = {"a", "b" , "c"}
set2 = {1, 2, 3}
set3 = set1.union(set2)
print(set3)
#{'a', 'b', 1, 2, 3, 'c'}
\end{python}
\subsubsection{merge (intersection) of sets}
\begin{python}
x = {"apple", "banana", "cherry"}
y = {"google", "microsoft", "apple"}
z = x.intersection(y)
print(z)
#{'apple'}
\end{python}
\subsection{Tuples}
Tuples are used to store multiple items in a single variable.
Elements of tuples are ordered and unchangeable.
\begin{python}
thistuple = ("apple", "banana", "cherry")
print(thistuple)
#('apple', 'banana', 'cherry')
print(len(thistuple))
#3
## a tuple with different objects
tuple1 = ("abc", 34, True, 40, "male")
\end{python}
\subsubsection{accessing elements}
\begin{python}
thistuple = ("apple", "banana", "cherry")
print(thistuple[1])
#banana
##negative indexing
print(thistuple[-1])
#cherry
##slicing
thistuple = ("apple", "banana", "cherry", "orange", "kiwi", "melon", "mango")
print(thistuple[2:5])
#('cherry', 'orange', 'kiwi')
print(thistuple[:4])
#('apple', 'banana', 'cherry', 'orange')
\end{python}
\subsubsection{check element existence}
\begin{python}
thistuple = ("apple", "banana", "cherry")
if "apple" in thistuple:
print("Yes, 'apple' is in the fruits tuple")
#Yes, 'apple' is in the fruits tuple
\end{python}
\subsubsection{unpack elements}
\begin{python}
fruits = ("apple", "banana", "cherry", "strawberry", "raspberry")
(green, yellow, *red) = fruits
print(green)
#apple
print(yellow)
#banana
print(red)
#['cherry', 'strawberry', 'raspberry']
\end{python}
\subsubsection{looping elements}
\begin{python}
thistuple = ("apple", "banana", "cherry")
for x in thistuple:
print(x)
#apple
#banana
#cherry
\end{python}
\subsubsection{joining two tuples}
\begin{python}
tuple1 = ("a", "b" , "c")
tuple2 = (1, 2, 3)
tuple3 = tuple1 + tuple2
print(tuple3)
#('a', 'b', 'c', 1, 2, 3)
\end{python}
\subsubsection{multiply tuples}
\begin{python}
fruits = ("apple", "banana", "cherry")
mytuple = fruits * 2
print(mytuple)
#('apple', 'banana', 'cherry', 'apple', 'banana', 'cherry')
\end{python}
\subsection{If conditional}
If conditional checks for cases in your program.
\begin{python}
if 5 > 2:
print("Five is greater than two!")
#Five is greater than two!
\end{python}
\subsubsection{Elif conditional}
\begin{python}
a = 33
b = 33
if b > a:
print("b is greater than a")
elif a == b:
print("a and b are equal")
#a and b are equal
\end{python}
\subsubsection{Else-If conditional}
\begin{python}
a = 200
b = 33
if b > a:
print("b is greater than a")
elif a == b:
print("a and b are equal")
else:
print("a is greater than b")
#a is greater than b
\end{python}
\subsubsection{Short If}
\begin{python}
a = 200
b = 33
# one-line if: the statement to run follows the colon on the same line
if a > b: print("a is greater than b")
#a is greater than b
\end{python}
\subsubsection{Short If-Else}
\begin{python}
a = 2
b = 330
print("A") if a > b else print("B")
# B
\end{python}
\subsubsection{And or conditional}
\begin{python}
a = 200
b = 33
c = 500
if a > b and c > a:
print("Both conditions are True")
# Both conditions are True
if a > b or a > c:
print("At least one of the conditions is True")
#At least one of the conditions is True
\end{python}
\subsubsection{Nested if}
With nested if's you can create branches in your program.
\begin{python}
x = 41
if x > 10:
print("Above ten,")
if x > 20:
print("and also above 20!")
else:
print("but not above 20.")
#Above ten,
#and also above 20!
\end{python}
\subsubsection{Pass}
An if statement cannot have an empty body. If you need an empty statement, use the pass statement to avoid getting an error.
\begin{python}
a = 33
b = 200
if b > a:
pass
\end{python}
\subsection{While Loop}
\begin{python}
i = 1
while i < 4:
print(i)
i += 1
#1
#2
#3
\end{python}
\subsubsection{Break statement}
You can exit the loops immediately with break statement.
\begin{python}
i = 1
while i < 6:
print(i)
if i == 3:
break
i += 1
#1
#2
#3
\end{python}
\subsubsection{Continue statement}
With continue statement we can stop the current iteration, and continue with the next.
\begin{python}
i = 0
while i < 6:
i += 1
if i == 3:
continue
print(i)
#1
#2
#4
#5
#6
\end{python}
\subsubsection{While-Else statement}
\begin{python}
i = 1
while i < 6:
print(i)
i += 1
else:
print("i is no longer less than 6")
#1
#2
#3
#4
#5
#i is no longer less than 6
\end{python}
\subsection{For Loop}
\begin{python}
fruits = ["apple", "banana", "cherry"]
for x in fruits:
print(x)
#apple
#banana
#cherry
\end{python}
\subsubsection{Looping integers}
\begin{python}
for x in range(4):
print(x)
#0
#1
#2
#3
\end{python}
\subsubsection{Looping a string variable}
\begin{python}
for x in "car":
print(x)
#c
#a
#r
\end{python}
\subsubsection{breaking the loop}
\begin{python}
fruits = ["apple", "banana", "cherry"]
for x in fruits:
print(x)
if x == "banana":
break
#apple
#banana
\end{python}
\subsubsection{for loop nested}
\begin{python}
adj = ["red", "big", "tasty"]
fruits = ["apple", "banana", "cherry"]
for x in adj:
for y in fruits:
print(x, y)
#red apple
#red banana
#red cherry
#big apple
#big banana
#big cherry
#tasty apple
#tasty banana
#tasty cherry
\end{python}
\subsubsection{for loop pass statement}
\begin{python}
for x in [0, 1, 2]:
pass
\end{python}
\subsection{Function}
In python, you can declare functions. Functions are code pieces that you can execute multiple times easily.
\subsubsection{defining a function}
\begin{python}
# define the function
def my_function():
print("Hello from a function")
\end{python}
\subsubsection{calling a function}
\begin{python}
def my_function():
print("Hello from a function")
my_function()
#Hello from a function
\end{python}
\subsubsection{giving function an argument}
\begin{python}
def my_function(fname):
print(fname + " Refsnes")
my_function("Emil")
#Emil Refsnes
my_function("Tobias")
#Tobias Refsnes
my_function("Linus")
#Linus Refsnes
\end{python}
\subsubsection{giving function multiple arguments}
\begin{python}
def my_function(fname, lname):
print(fname + " " + lname)
my_function("Emil", "Refsnes")
#Emil Refsnes
\end{python}
\subsubsection{giving function with default parameters}
\begin{python}
def my_function(country = "Norway"):
print("I am from " + country)
my_function("Sweden")
#I am from Sweden
my_function("India")
#I am from India
my_function()
#I am from Norway
my_function("Brazil")
#I am from Brazil
\end{python}
\subsubsection{giving function unknown number of parameters}
\begin{python}
def my_function(*kids):
print("The youngest child is " + kids[2])
my_function("Emil", "Tobias", "Linus")
#The youngest child is Linus
\end{python}
\subsubsection{function returning values}
\begin{python}
def my_function(x):
return 5 * x
print(my_function(3))
#15
print(my_function(5))
#25
print(my_function(9))
#45
\end{python}
\subsubsection{function returning multiple values}
\begin{python}
def my_function(x):
return 5 * x, x * x
print(my_function(3))
#(15, 9)
print(my_function(10))
#(50, 100)
print(type(my_function(10)))
#<class 'tuple'>
\end{python}
\subsubsection{shortcut function: lambda}
Lambda functions are anonymous one-line functions. They are convenient when a short function is needed inside another function or expression.
\begin{python}
x = lambda a : a + 10
print(x(5))
#15
x = lambda a, b : a * b
print(x(5, 6))
#30
x = lambda a, b, c : a + b + c
print(x(5, 6, 2))
#13
\end{python}
\subsection{Modules}
Python modules are libraries (sets of functions and variables) written by you or by others. Using these code pieces helps you to re-use written code.
\subsubsection{writing a module}
Save the following to a file named mymodule.py .
\begin{python}
def greeting(name):
print("Hello, " + name)
person1 = {
"name": "John",
"age": 36,
"country": "Norway"
}
\end{python}
\subsubsection{use functions of a module}
\begin{python}
import mymodule
mymodule.greeting("Jonathan")
#Hello, Jonathan
\end{python}
\subsubsection{custom naming a module}
\begin{python}
import mymodule as mx
a = mx.person1["age"]
print(a)
#36
\end{python}
\subsubsection{partial import of a module}
Sometimes, you just need a function, or an object from a module package. In these cases, you only import that part of the module. No need to import whole module. This is a better practice.
\begin{python}
from mymodule import person1
print (person1["age"])
#36
\end{python}
\subsubsection{listing functions of the module}
You can list all the function names in a module with dir command.
\begin{python}
import platform
x = dir(platform)
# too long to write the output... check your program.
\end{python}
\subsection{Classes}
%Python supports object oriented programming (OOP). In default, what we are doing is functional programming. That is,
%the program is a composition of functions. In OOP, similar to the real world, we have objects. These objects are represented by classes in the program.
%A class has attributes, and methods (in other word class functions). We create instances from that class using class constructor.
\begin{python}
class Person:
def __init__(mysillyobject, name, age):
mysillyobject.name = name
mysillyobject.age = age
def myfunc(abc):
print("Hello my name is " + abc.name)
p1 = Person("John", 36)
p1.myfunc()
\end{python}
\subsection{Uncategorized}
\subsubsection{add padding to opencv image}
\begin{python}
#cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)
# NOTE(review): img_dilated is assumed to be an image array loaded earlier -- confirm
padSize = 500
# the border colour must be passed as the keyword argument `value`;
# the 7th positional parameter of copyMakeBorder is the output array `dst`
ib = cv2.copyMakeBorder(img_dilated, padSize, padSize, padSize, padSize, cv2.BORDER_CONSTANT, value=(0,0,0))
\end{python}
\subsubsection{find neighboring pairs in list}
\begin{python}
A = [1, 2, 3, 4]
B = [(i,j) for i,j in zip(A, A[1:])]
#[(1, 2), (2, 3), (3, 4)]
\end{python}
\subsubsection{find most frequent elements in list}
\begin{python}
from collections import Counter
mylist = [1,1,1,1,3,4,5,5,5,6,6,7,8,9]
counter = Counter(mylist)
most_common = counter.most_common(2)
print(most_common)
#[(1, 4), (5, 3)] # 4 ones, 3 fives seen.
\end{python}
\subsubsection{cartesian product of two lists}
\begin{python}
import itertools
a = [1,2,3]
b = [4,5,6]
for i in itertools.product( a, b ):
print(i)
\end{python}
\subsubsection{pad integer to have zeros}
\begin{python}
def NDigited(x,n=3):
return (n-len(x)) * '0' + x
\end{python}
\paragraph{optimized version}
\credit{jnmbk}
\begin{python}
def NDigited(x,n=3):
    """Return x left-padded with zeros to a width of at least n characters.

    x -- the number to pad, given as a string or an integer
    n -- minimum width of the result (default 3)

    Values that are already n characters or longer are returned unpadded.
    """
    # str() lets callers pass an int directly; zfill() does the padding
    return str(x).zfill(n)
\end{python}
\subsubsection{matplotlib display strings on y-axis}
\begin{python}
yticks(np.arange(5), ('String1', 'String2', 'String3', '4', '5'))
\end{python}
\subsubsection{broadcast image or matrix channels}
This operation is usually needed to go from a single-channel (2D) image to a multi-channel (3D) image.
\begin{python}
nChannels = 3
m3d = np.repeat( m.reshape(m.shape[0], m.shape[1], 1), nChannels, axis=2)
\end{python}
Another identical way to do it.
\begin{python}
nChannels = 3
m3d = np.tile(m[:, :, None], [1, 1, nChannels])
\end{python}
\subsubsection{1d interpolation of x-y values}
Given a set of 2D points, we fit a curve to these points.
\begin{python}
import numpy as np
from scipy.interpolate import interp1d
xdata = [0,1,2,3,4,5]
ydata = [0,1,4,9,16,25]
# build a piecewise-quadratic interpolant through the sample points
f2 = interp1d(xdata, ydata, kind = 'quadratic')
# by default interp1d raises ValueError outside [min(xdata), max(xdata)],
# so evaluate only inside the sampled range
xnew = np.linspace(0, 5, 1000)
ynew = f2(xnew)
\end{python}
\subsection{useful numpy functions}
\begin{python}
import numpy as np
## remove empty dimension
x = np.array([[[0], [1], [2]]])
print(x.shape)
#(1, 3, 1)
# squeeze() returns a new array; assign it back to see the changed shape
x = np.squeeze(x)
print(x.shape)
#(3,)
## stack a list of matrices depth-wise (along a third axis)
possibleCurves = [np.zeros((2, 2)), np.ones((2, 2))]  # example input
x = np.dstack( possibleCurves )
print(x.shape)
#(2, 2, 2)
## randomly choose 5 distinct values from the integers 0-99
randIdxs = np.random.choice(100 , 5, replace=False)
## reshape data into a single column (one feature problems)
X = x.reshape(-1,1)
\end{python}
\section{String Manipulation, Searching, Sorting}\label{sect:strings}
\subsection{substring search}
\begin{python}
word = 'cart for supermarket'
##substring search: find first occurrence (find returns -1 when there is no match)
result = word.find('supermarket')
print("Substring 'supermarket' found at index:", result)
#Substring 'supermarket' found at index: 9
##substring search with start end specification: searched in word[4:12], i.e. ' for sup'.
print(word.find('su', 4, 12))
#9
\end{python}
\subsubsection{string between two substrings}
\begin{python}
import re
s = 'asdf=5;iwantthis123jasd'
result = re.search('asdf=5;(.*)123jasd', s)
print(result.group(1))
#iwantthis
\end{python}
\subsubsection{Create index for strings}
\begin{python}
a = ['a', 'b', 'c']
b = dict(map(lambda t: (t[1], t[0]), enumerate(a)))
#{'a':0, 'b':1, 'c':2}
\end{python}
\subsection{string concatenation}
\begin{python}
s1 = "myStrFirst"
s2 = "secondString"
s3 = s1 + " " + s2
print(s3)
#myStrFirst secondString
\end{python}
\subsection{string splitting}
\begin{python}
## simple string splitting
txt = "apple#banana#cherry#orange"
x = txt.split("#")
print(x)
#['apple', 'banana', 'cherry', 'orange']
## setting the maxsplit parameter to 1, will return a list with 2 elements!
txt = "apple#banana#cherry#orange"
x = txt.split("#", 1)
#['apple', 'banana#cherry#orange']
\end{python}
\subsection{stripping string}
Remove leading and trailing spaces and specific characters at the beginning and at the end of a string.
\begin{python}
txt = " banana sss "
x = txt.strip()
print("of all fruits", x, "is my favorite")
#of all fruits banana sss is my favorite
txt = ",,,,,rrttgg.....banana....rrr"
x = txt.strip(",.grt")
print(x)
#banana
\end{python}
\subsection{combining list of strings}
\begin{python}
text = ['Python', 'is', 'a', 'fun', 'programming', 'language']
print(' '.join(text))
# Python is a fun programming language
\end{python}
\section{Input Output Operations}\label{sect:io}
\subsection{create a file}
\begin{python}
f = open("demofile3.txt", "w")
f.write("I have added content!")
f.close()
\end{python}
\subsection{write to a file: fast shortcut}
In this version, you do not have to remember to close the file.
\begin{python}
# the with-block closes the file automatically when the block is left
with open("demofile3.txt", "w") as fp:
    fp.write("I have added content!")
\end{python}
\subsection{create directory}
The following program checks for a directory, and creates it if not present.
\begin{python}
import os
directory = "newDirectory"
parent_dir = "/home/User/Documents"
path = os.path.join(parent_dir, directory)
os.makedirs(path,exist_ok = True)
\end{python}
\subsection{remove file}
\begin{python}
import os
fileName = 'myFile.txt' # File name
location = "/home/User/Documents"
path = os.path.join(location, fileName)
if os.path.exists(path):
os.remove(path)
else:
print("The file does not exist")
\end{python}
\subsection{save and load pickle file}
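Pickle is the built-in binary serialization format of Python. It can store most Python objects. Only load pickle files from sources you trust, because loading a pickle file can execute arbitrary code.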
\begin{python}
import pickle
a = {'hello': 'world'}
with open('filename.pkl', 'wb') as handle:
pickle.dump(a, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('filename.pkl', 'rb') as handle:
b = pickle.load(handle)
print(a == b)
\end{python}
\begin{python}
import pickle

def load_obj(name):
    """Load and return the object stored in the file name + '.pkl'.

    name -- file path without the '.pkl' extension
    """
    # only unpickle files you trust: loading can execute arbitrary code
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)

def write_obj(name, data):
    """Pickle data into the file name + '.pkl'.

    name -- file path without the '.pkl' extension (same convention as load_obj)
    data -- the object to store
    """
    # append '.pkl' here as well, so write_obj(name, ...) pairs with load_obj(name)
    with open(name + '.pkl', 'wb') as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)
\end{python}
\subsection{joining paths}
\begin{python}
import os
path = "/home"
## Join various path components
print(os.path.join(path, "User/Desktop", "file.txt"))
#/home/User/Desktop/file.txt
\end{python}
\subsection{check file existence}
\begin{python}
import os
path = 'D:/Pycharm/USER/testFile.txt'
# isfile() is True only when the path exists and is a regular file
isFile = os.path.isfile(path)
print(isFile)
#False
\end{python}
\subsection{list files in a directory}
\begin{python}
import os
path = "/"
dir_list = os.listdir(path)
print(dir_list)
# too many output, please run the code yourself.
\end{python}
\subsection{iterate (traverse) files in a folder}
\begin{python}
import os
# topdown=True visits each directory before its subdirectories
for (root,dirs,files) in os.walk('Test', topdown=True):
    print (root)
    print (dirs)
    print (files)
# too much output, please run the code yourself.
\end{python}
\subsection{sort files by date}
\begin{python}
import os
search_dir = "/mydir/"
files = os.listdir(search_dir)
files = [os.path.join(search_dir, f) for f in files]
files.sort(key=lambda x: os.path.getmtime(x))
\end{python}
\subsection{write to CSV file}
\begin{python}
import csv
with open('employee_file.csv', mode='w') as employee_file:
employee_writer = csv.writer(employee_file, delimiter=',', quotechar='"')
employee_writer.writerow(['name', 'department', 'birthday month'])
employee_writer.writerow(['John Smith', 'Accounting', 'November'])
employee_writer.writerow(['Erica Meyers', 'IT', 'March'])
employee_writer.writerow(['Monica Barker', 'HR', 'December'])
## open the employee_file.csv and you will see:
## name,department,birthday month
## John Smith,Accounting,November
## Erica Meyers,IT,March
## Monica Barker,HR,December
\end{python}
\subsection{read CSV file example}
\begin{python}
import csv
# write the sample file first; newline='' lets the csv module control the line endings
with open('employee_file.csv', mode='w', newline='') as employee_file:
    employee_writer = csv.writer(employee_file, delimiter=',', quotechar='"')
    employee_writer.writerow(['name', 'department', 'birthday month'])
    employee_writer.writerow(['John Smith', 'Accounting', 'November'])
    employee_writer.writerow(['Erica Meyers', 'IT', 'March'])
    employee_writer.writerow(['Monica Barker', 'HR', 'December'])
# read back the same file that was just written
with open('employee_file.csv', newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0
    for row in csv_reader:
        if line_count == 0:
            # the first row holds the column names
            print(f'Column names are {", ".join(row)}')
        else:
            print(f'\t{row[0]} works in the {row[1]} department, and was born in {row[2]}.')
        line_count += 1
    print(f'Processed {line_count} lines.')
\end{python}
\subsubsection{read CSV into list}
\begin{python}
import csv
# create the sample file that is read back below
with open('employee_file.csv', mode='w', newline='') as employee_file:
    employee_writer = csv.writer(employee_file, delimiter=',', quotechar='"')
    employee_writer.writerow(['name', 'department', 'birthday month'])
    employee_writer.writerow(['John Smith', 'Accounting', 'November'])
    employee_writer.writerow(['Erica Meyers', 'IT', 'March'])
    employee_writer.writerow(['Monica Barker', 'HR', 'December'])
##
def readCSVIntoList(fileName, discardHeader=False):
    """Read a comma-separated file and return its rows as a list of lists.

    fileName      -- path of the CSV file to read
    discardHeader -- when True, the first row (the column names) is dropped
    """
    with open(fileName, newline='') as csv_file:
        rows = list(csv.reader(csv_file, delimiter=','))
    # slicing also works on an empty file, where rows[0] would raise IndexError
    return rows[1:] if discardHeader else rows
rows = readCSVIntoList('employee_file.csv',discardHeader=True)
print(rows)
#[['John Smith', 'Accounting', 'November'], ['Erica Meyers', 'IT', 'March'], ['Monica Barker', 'HR', 'December']]
\end{python}
\subsection{Adding Command Line Arguments}
The following program expects command line arguments. If not provided, uses default arguments.
\begin{python}
## run with python sourcefile.py --keyword mykeyword --page 1
## or python sourcefile.py
import argparse
parser = argparse.ArgumentParser(description="Just an example",formatter_class=argparse.ArgumentDefaultsHelpFormatter)
# the defaults are declared on the arguments, so --help shows them and
# each option falls back to its default independently when it is omitted
parser.add_argument("-k", "--keyword", type=str, default="googleit", help="query keywords")
parser.add_argument("-p", "--page", type=str, default="0", help="query page")
args = parser.parse_args()
config = vars(args)
#print(config)
searchKeyword = args.keyword
# the option is named --page, so the parsed attribute is args.page
searchPage = args.page
print("SearchKeyword is " + searchKeyword + " Search Page is " + searchPage )
\end{python}
\section{Time and Date}
\subsection{Get current date }
\begin{python}
##Get the current date in DD-MM-YYYY-HR-MM-SS format:
from datetime import datetime
now = datetime.now()
curDate = now.strftime("%d-%m-%Y-%H-%M-%S")
print(curDate)
# 18-06-2022-10-40-59
\end{python}
\subsection{Convert Unix time to datetime}
\begin{python}
from datetime import datetime, timezone
x = 1656100252345  # Unix time in milliseconds
# fromtimestamp() expects seconds; tz=timezone.utc gives the UTC time
# (datetime.utcfromtimestamp() is deprecated since Python 3.12)
d = datetime.fromtimestamp( x / 1000, tz=timezone.utc ).strftime('%Y-%m-%d %H:%M:%S')
print(d)
#2022-06-24 19:50:52
\end{python}
\section{Algorithms}
\subsection{Remove elements}
\subsection{Find middle coordinates of a coordinate array}
\begin{python}
## pair every coordinate with its successor and average each pair
[ (left + right) / 2 for left, right in zip(linesProc2[:-1], linesProc2[1:]) ]
\end{python}
\section{Regex}
\subsection{Nongreedy regex search}
By default, regex quantifiers are greedy: they match the longest possible sequence. Append \texttt{?} to a quantifier to make it nongreedy, so that it matches the shortest possible sequence:
\begin{python}
import re
text = "From: test: test"
## greedy: ".+" consumes as much as possible, so the match ends at the LAST ':'
greedy_match = re.search(r"^F.+:", text).group()
print(greedy_match)      # From: test:
## nongreedy: ".+?" consumes as little as possible, so the match ends at the FIRST ':'
nongreedy_match = re.search(r"^F.+?:", text).group()
print(nongreedy_match)   # From:
\end{python}
\section{Network}\label{sect:network}
\subsection{Single threaded to multi threaded}
Python programs are single-threaded by default. The listing below first runs two tasks one after the other and then runs the same two tasks in separate threads:
\begin{python}
### ---------------------- ###
### Single-threaded version: the tasks run one after the other,
### so it takes approximately six seconds.
### ---------------------- ###
from time import sleep, perf_counter
def task():
    print('Starting a task...')
    sleep(3)
    print('done')
start_time = perf_counter()
for _ in range(2):
    task()
end_time = perf_counter()
print(f'It took {end_time- start_time: 0.2f} second(s) to complete.')
# the elapsed time is dominated by the two sleep(3) calls. please run the code!
### ---------------------- ###
### Multi-threaded version: both tasks sleep at the same time,
### so it takes approximately 3 seconds.
### ---------------------- ###
from time import sleep, perf_counter
from threading import Thread
def task():
    print('Starting a task...')
    sleep(3)
    print('done')
start_time = perf_counter()
# create two new threads
workers = [Thread(target=task), Thread(target=task)]
# start the threads
for worker in workers:
    worker.start()
# wait for the threads to complete
for worker in workers:
    worker.join()
end_time = perf_counter()
print(f'It took {end_time- start_time: 0.2f} second(s) to complete.')
# the elapsed time is dominated by the overlapping sleep(3) calls. please run the code!
\end{python}
\subsection{Multi-thread with argument}
\begin{python}
from time import sleep, perf_counter
from threading import Thread
def task(id):
    ## each worker announces itself, waits one second and reports back
    print(f'Starting the task {id}...')
    sleep(1)
    print(f'The task {id} completed')
start_time = perf_counter()
## create 10 threads; args must be a tuple, hence the trailing comma
threads = [Thread(target=task, args=(n,)) for n in range(1, 11)]
## start all of them
for worker in threads:
    worker.start()
## wait for the threads to complete
for worker in threads:
    worker.join()
end_time = perf_counter()
print(f'It took {end_time- start_time: 0.2f} second(s) to complete.')
# Please run the program on your computer to see the output!
\end{python}
\section{Web}\label{sect:web}
\subsection{Scrape HTML with Beautiful Soup}
This example code scrapes an HTML page and searches for HTML div tags inside it.
\begin{python}
import requests
from bs4 import BeautifulSoup
URL = "https://edition.cnn.com/"
page = requests.get(URL)
soup = BeautifulSoup(page.content, "html.parser")
## print the HTML content
print(soup.prettify())
## find first div element in the page
myDiv = soup.find("div")
## find all div elements in the page (find_all is the current name of findAll)
myDivs = soup.find_all("div")
## find all divs with class equal to the following string.
job_elements = soup.find_all("div", attrs={"class":"card-content"} )
## get the href attribute of the first "a" element that has one
firstLink = soup.find('a', href=True)
if firstLink is not None:
    print(firstLink['href'])
\end{python}
\subsection{Make request until success}
\begin{python}
import time
from requests import get
from bs4 import BeautifulSoup

def retryResponseGetSoup(url, maxRetries=None, waitSeconds=5):
    """Request ``url`` repeatedly until a usable page is returned.

    A response is accepted when its status code is 200 and the page text is
    not the error text 'Baglanti hatasi.'.

    Args:
        url: full URL including the scheme (e.g. "https://...").
        maxRetries: maximum number of retries; None (default) retries forever.
        waitSeconds: pause between two attempts.

    Returns:
        The parsed page as a BeautifulSoup object.

    Raises:
        RuntimeError: if ``maxRetries`` is set and all retries failed.
    """
    retries = 0
    while True:
        response = get(url)
        ## name the parser explicitly; otherwise bs4 guesses one and warns
        bs = BeautifulSoup(response.content, "html.parser")
        if response.status_code == 200 and bs.text != 'Baglanti hatasi.':
            return bs
        if maxRetries is not None and retries >= maxRetries:
            raise RuntimeError("no valid response from %s after %d retries" % (url, retries))
        retries += 1
        print('retrying...')
        time.sleep(waitSeconds)

## requests needs the scheme; a bare "www.google.com" raises MissingSchema
url = "https://www.google.com"
s = retryResponseGetSoup(url)
\end{python}
\subsection{Selenium}
\subsubsection{Access attribute of an element}
\begin{python}
### selenium python: read an attribute of every matched element
### (assumes `driver` is an already created WebDriver)
from selenium.webdriver.common.by import By
elements = driver.find_elements(By.XPATH, "//div[contains(@class, 'navigationPane')]/a")
for e in elements:
    ## get_attribute only returns the value; print it so it is not discarded
    print(e.get_attribute("outerHTML"))
\end{python}
\subsubsection{Scroll to element}
\begin{python}
### selenium python scroll to element's location
### (assumes `driver` and `element` already exist)
## vertical position of the element within the page
desired_y = element.location['y']
## vertical position of the middle of the currently visible window
current_y = (driver.execute_script('return window.innerHeight') / 2) + driver.execute_script('return window.pageYOffset')
## scrollBy is relative, so scroll by the difference to centre the element
scroll_y_by = desired_y - current_y
driver.execute_script("window.scrollBy(0, arguments[0]);", scroll_y_by)
\end{python}
\subsubsection{Access pure HTML of the element}
\begin{python}
## locate the table body first, then ask the browser for its raw HTML
tableBodyXPath = "//table[contains(@class, 'morphologyTable')]//tbody"
tableElems = driver.find_element(By.XPATH, tableBodyXPath)
tableElems.get_attribute("outerHTML")
\end{python}
\subsubsection{Save cropped screenshot}
\begin{python}
from PIL import Image
## assumes `driver` exists and x1, w1, y1, y2, h2 hold the position/size
## of the elements that delimit the region to keep -- TODO confirm
driver.save_screenshot('shot.png')
im = Image.open('shot.png')
## crop box is (left, upper, right, lower); keep a 5 px horizontal margin
## and never let the left edge become negative
im = im.crop((max(0, int(x1-5)), int(0), int(x1+w1+5), int(y2-y1+h2)))
im.save('shot.png')
\end{python}
\subsection{Download files}
\begin{python}
import requests
image_url = "https://www.python.org/static/community_logos/python-logo-master-v3-TM.png"
r = requests.get(image_url, timeout=30)
## fail loudly on 4xx/5xx instead of saving an error page as a .png
r.raise_for_status()
## 'wb': the payload is binary
with open("python_logo.png",'wb') as f:
    f.write(r.content)
\end{python}
\section{Pandas}
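Pandas is an open-source Python library for working with tabular data through its \texttt{DataFrame} and \texttt{Series} types. It is useful because it reads common file formats such as CSV directly and lets you clean, filter, group and aggregate data in a few lines of code.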
\subsection{read CSV }
We can use the Pandas library to read CSV files easily. The content is loaded into a Pandas \texttt{DataFrame}.
\begin{python}
import csv
import pandas as pd
## write a small example file first ('w' is enough; newline='' is what the csv module expects)
with open('employee_file.csv', mode='w', newline='') as employee_file:
    employee_writer = csv.writer(employee_file, delimiter=',', quotechar='"')
    employee_writer.writerow(['name', 'department', 'birthday month'])
    employee_writer.writerow(['John Smith', 'Accounting', 'November'])
    employee_writer.writerow(['Erica Meyers', 'IT', 'March'])
    employee_writer.writerow(['Monica Barker', 'HR', 'December'])
## read it back into a DataFrame; the first row becomes the column names
data= pd.read_csv("employee_file.csv")
print(data)
#             name  department birthday month
#0      John Smith  Accounting       November
#1    Erica Meyers          IT          March
#2   Monica Barker          HR       December
\end{python}
\subsection{analyze and clean data }
\begin{python}
import csv
import pandas as pd
with open('employee_file.csv', mode='w', newline='') as employee_file:
    employee_writer = csv.writer(employee_file, delimiter=',', quotechar='"')
    employee_writer.writerow(['name', 'department', 'birthday month','salary','gender'])
    employee_writer.writerow(['John Smith', 'Accounting', 'November', '100', 'm'])
    employee_writer.writerow(['Erica Meyers', 'IT', 'March', '200', 'f'])
    employee_writer.writerow(['Monica Barker', '', '', '400', 'f'])
df= pd.read_csv("employee_file.csv")
## print head of the data
print(df.head(10))
## print tail of the data
print(df.tail())
## info about the data (info() prints by itself and returns None)
df.info()
## get column data types
df.dtypes
## drop rows that contain an empty cell
new_df = df.dropna()
print(new_df.to_string())
## drop unnecessary columns (drop returns a new frame; keep the result)
df_no_gender = df.drop(columns=['gender'])
## replace empty places in every column (on a copy, so the steps below still see the gaps)
filled_df = df.fillna(130)
## only replace specific columns
## (assign the result back; inplace=True on a selected column does not reliably update df)
df["salary"] = df["salary"].fillna(130)
## substitute column mean to the empty places
x = df["salary"].mean()
df["salary"] = df["salary"].fillna(x)
## substitute column median to the empty places
x = df["salary"].median()
df["salary"] = df["salary"].fillna(x)
## substitute column mode to the empty places
x = df["salary"].mode()[0]
df["salary"] = df["salary"].fillna(x)
## remove rows with an empty 'department'
df = df.dropna(subset=['department'])
\end{python}
\subsection{basic functionality: access, sampling, filtering}
\begin{python}
import pandas as pd
df = pd.read_csv("../pokemon_data.txt", delimiter="\t")
## Read headers
df.columns
## list the frequency of each Generation field
df['Generation'].value_counts()
## give how many uniques are in the dataset
df["Generation"].nunique()
## Read each column
df["Speed"]
df["Speed"][0:5]
df.Speed   # attribute access works when the column name is a valid identifier
df[ ["Speed", "HP"] ]
## Read each row
df.iloc[1]
df.iloc[1:4]
[row for index, row in df.iterrows()]
## Read a specific location (R,C)
df.iloc[2,1]
## Select rows: the comparison goes OUTSIDE the column access
df.loc[ df["Type 1"] == "Fire" ]
## Sorting data
df.sort_values("Speed")
df.sort_values("Speed", ascending=False)
## one flag per sort key: HP ascending, Speed descending
df.sort_values(["HP", "Speed"], ascending=[True, False])
## Making changes to the data
df['Total'] = df['Total'] - 5
## row-wise mean of several columns
df['Total'] = df[ ['CA','CB','CC'] ].mean(axis=1)
def f(x,y):
    return x+y
## Iterating over one column
result = [x for x in df['End']]
## Iterating over two columns, use `zip`
result = [f(x, y) for x, y in zip(df['Start'], df['End'])]
## Iterating over multiple columns - same data type
result = [f(row[0], row[1]) for row in df[['Start', 'End']].to_numpy()]
## Iterating over multiple columns - differing data type
result = [f(row[0], row[1]) for row in zip(df['Start'], df['End'])]
## row-by-row iteration is the slowest option. Avoid when a column-wise form exists.
for row in df.itertuples():
    print(row)
##Removing columns (the keyword is `inplace`, without underscore)
df.drop( columns=['Total'], inplace=True)
##Selecting rows conditionally
df.query("salary > 20")
## Summing every row of the dataframe
df.sum(axis=1)
## Drop repeating entries
df.drop_duplicates(inplace = True)
## save results to csv
df.to_csv("myCsvFile.csv", index=False)
\end{python}
\subsection{advanced: multi column access, contains, groupby}
\begin{python}
#####################
### Advanced
import pandas as pd
df = pd.read_csv("../pokemon_data.txt", delimiter="\t")
df.loc[ df["Type 1"] == "Grass" ]
## Sample using multiple conditions: combine the boolean masks with &
## (each comparison needs its own parentheses)
new_df = df.loc[ (df["Type 1"] == "Grass") & (df["Type 2"] == "Poison") ]
## After filtering, the old index stays. You have to reset the index then.
new_df = new_df.reset_index()            # old index is kept as a column
new_df = new_df.reset_index(drop=True)   # removes old idx
## Using contains
df.loc[ df["Name"].str.contains("Mega") ]
df.loc[ ~df["Name"].str.contains("Mega") ] # take other set
df.loc[ df["Name"].str.contains("Fire|Grass", regex=True) ]
df.loc[ df["Name"].str.contains("pi[a-z]*", regex=True) ]
#### Conditional Changes
## Change Type1 column having entry "fire" to "flamer"
df.loc[ df["Type 1"] == "Fire", "Type 1" ] = "Flamer"
## Change two columns at the same time.
df.loc[ df["Total"] > 500, ["Generation", "Legendary"] ] = ["Test 1", "Test2"]
## Aggregate data using groupby
## (numeric_only=True skips text columns, which cannot be averaged)
df.groupby( ["Type 1"] ).mean(numeric_only=True)
df.groupby("Type 1")['HP'].sum()
df.groupby( ["Type 1"] ).mean(numeric_only=True).sort_values("Defense", ascending=False)
df.groupby( ["Type 1"] ).count()
### Working with large data: read and process the file in chunks of 5000 rows
for chunk in pd.read_csv("modified.csv", chunksize=5000):
    print(chunk)
\end{python}
\subsection{calculate column cumulatives}
\begin{python}
import pandas as pd
## four rows of sample data
sample_rows = [[1, 2, 7, 10],
               [10, 22, 1, 30],
               [30, 42, 2, 10],
               [100, 142, 22, 1]]
df = pd.DataFrame(data=sample_rows, columns=['Start','End','Value1','Value2'])
## running totals of the two value columns, renamed in one chained step
cumulative_names = {'Value1': 'Cumulative Value1', 'Value2': 'Cumulative Value2'}
df2 = df[['Value1', 'Value2']].cumsum().rename(columns=cumulative_names)
print(df2)
\end{python}
\subsection{operations on two data frames}
\begin{python}
import numpy as np
import pandas as pd
## both frames share the same column labels
value_columns = ['Value1','Value2','Value3','Value4']
df = pd.DataFrame(data=[[1, 2, 7, 10],
                        [10, 22, 1, 30],
                        [30, 42, 2, 10],
                        [100, 142, 22, 1]],
                  columns=value_columns)
df2 = pd.DataFrame(data=[[10, 20, 30, 40],
                         [5, 1, 6, 32],
                         [143, 152, 2, 10],
                         [np.nan, 162, 12, 11]],
                   columns=value_columns)
## element-wise sum; a missing value on either side gives NaN
df + df2
## same sum, but a missing value counts as 0
df.add(df2, fill_value=0)
## element-wise "df > df2"; the result is a frame of booleans.
## eq, ne, lt, gt, le and ge all work the same way.
df.gt(df2)
\end{python}
\subsubsection{Concat join rows}
\begin{python}
import pandas as pd
df = pd.DataFrame(data=[[10, 20, 30], [11, 21, 31]], columns=['Key1','Key2', 'Key3'] )
df2 = pd.DataFrame(data=[[5, 6, 7], [5, 8, 12]], columns=['Key1','Key2', 'Key3'] )
df3 = pd.concat([df, df2], axis=0)
## reset_index returns a new frame: keep it, otherwise the index stays 0,1,0,1
df3 = df3.reset_index(drop=True)
print(df3)
#   Key1  Key2  Key3
#0    10    20    30
#1    11    21    31
#2     5     6     7
#3     5     8    12
\end{python}
\subsubsection{Concat join rows with different columns}
\begin{python}
import pandas as pd
numeric_rows = [[10, 20, 30], [11, 21, 31]]
labelled_rows = [[5, "Lazy"], [5, "Hardworking"]]
df = pd.DataFrame(data=numeric_rows, columns=['Key1','Key2', 'Key3'] )
df2 = pd.DataFrame(data=labelled_rows, columns=['Key4','Key5'] )
## stacking frames with different columns: cells without a value become NaN
pd.concat([df, df2], axis=0)
#   Key1  Key2  Key3  Key4         Key5
#0  10.0  20.0  30.0   NaN          NaN
#1  11.0  21.0  31.0   NaN          NaN
#0   NaN   NaN   NaN   5.0         Lazy
#1   NaN   NaN   NaN   5.0  Hardworking
\end{python}
\subsubsection{Concat join columns}
\begin{python}
import pandas as pd
numeric_rows = [[10, 20, 30], [11, 21, 31]]
labelled_rows = [[5, "Lazy"], [5, "Hardworking"]]
df = pd.DataFrame(data=numeric_rows, columns=['Key1','Key2', 'Key3'] )
df2 = pd.DataFrame(data=labelled_rows, columns=['Key4','Key5'] )
## axis=1 puts the frames side by side, matching rows by index
pd.concat([df, df2], axis=1)
#   Key1  Key2  Key3  Key4         Key5
#0    10    20    30     5         Lazy
#1    11    21    31     5  Hardworking
\end{python}
\subsection{applying a function to dataFrame rows or columns}
\begin{python}
import numpy as np
import pandas as pd
df = pd.DataFrame(data=[["Kevin", 2, 6.], ["Frank", 22, 8.],
                        ["Sarah", 4, 5.], ["Galvin", 3, 10.]],
                  columns=['Name','Years','Ability'])
print(df)
#     Name  Years  Ability
#0   Kevin      2      6.0
#1   Frank     22      8.0
#2   Sarah      4      5.0
#3  Galvin      3     10.0
## axis=0: apply the function to every COLUMN (one total per column)
df1 = df.apply(np.sum, axis=0)
print(df1)
#Name       KevinFrankSarahGalvin
#Years                         31
#Ability                     29.0
## axis=1: apply the function to every ROW (one total per row)
df2 = df[["Years", "Ability"]].apply(np.sum, axis=1)
print(df2)
#0     8.0
#1    30.0
#2     9.0
#3    13.0
\end{python}
\subsection{plot values with dates on x axis}
\begin{python}
import pandas as pd
import matplotlib.pyplot as plt
df = pd.DataFrame(data=[["10-06-2022", 5], ["09-06-2022", 3], ["11-06-2022", 20],
                        ["13-06-2022", 12],["12-06-2022", 15], ["14-06-2022", 7]], columns=['Date','Sales'])
df["time"] = pd.to_datetime(df['Date'], format='%d-%m-%Y')
##df["time"] = pd.to_datetime(df['Date'], format='%Y-%m-%d %H:%M:%S.%f')
df.set_index(['time'],inplace=True)
## the rows are not in date order; sort them so the line does not run back and forth
df.sort_index(inplace=True)
df.plot()
plt.show()
\end{python}
\subsection{Example code 1:}
\begin{python}
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
df = pd.read_csv("examples/4weeks_date.csv")
##see the columns
df.columns
##strip the column names to remove extra whitespaces
df = df.rename(columns=lambda x: x.strip())
##check dataset
df.describe()
df.info()
print("initial length of dataset %d " % len(df) )
##drop rows when ENTRIES or EXITS is zero
df_clean = df.query("ENTRIES > 0")
df_clean = df_clean.query("EXITS > 0")
print("length after cleaning 1 %d " % len(df_clean) )
##drop entries bigger than 5M
df_clean = df_clean.query("ENTRIES < 5000000")
df_clean = df_clean.query("EXITS < 5000000")
df_clean = df_clean.reset_index(drop = True)
print("length after cleaning 2 %d " % len(df_clean) )
##compute total activity : ENTRIES + EXITS
df_clean["TA"] = df_clean["ENTRIES"] + df_clean["EXITS"]
##combine date and time columns. Then convert to pdDate
df_clean["DT"] = df_clean["DATE"] + " " + df_clean["TIME"]
df_clean["DATETIME"] = pd.to_datetime(df_clean['DT'], format='%m/%d/%Y %H:%M:%S')
##select a station and sum same day activities
df_clean["LINENAME"].value_counts()
usedLineName = "1237ACENQRS"
myLineDF = df_clean[ df_clean["LINENAME"] == usedLineName]
##sort rows chronologically (the parsed DATETIME sorts correctly, unlike the DATE strings)
myLineDF = myLineDF.sort_values("DATETIME").reset_index(drop = True)
##add up rows with identical DATETIME; only the numeric columns can be summed
myLineDFGrouped = myLineDF.groupby("DATETIME")[["ENTRIES", "EXITS", "TA"]].sum()
## DATETIME stays as the index, so the x axis below really shows dates
## Create figure and plot space
fig, ax = plt.subplots(figsize=(10, 10))
ax.plot(myLineDFGrouped.index , myLineDFGrouped["TA"], color='purple')
ax.set(xlabel="Date", ylabel="Total activity", title="Total entries for line: %s" % usedLineName )
plt.show()
## Alternatively use default Plotter
myLineDFGrouped.plot()
plt.show()
\end{python}
\section{Matplotlib Visualization}
\subsection{Basics}
\subsubsection{Basic plotting}
\begin{python}
from matplotlib import pyplot as plt
## figure size is given in inches (width, height)
plt.figure(figsize = (20,9))
## plot y values against x values; plt.plot() without data draws an empty figure
plt.plot([0, 1, 2, 3], [0, 1, 4, 9])
plt.show()
\end{python}
\section{Seaborn Visualization}
Seaborn is a visualization library on top of Matplotlib. Seaborn is more comfortable in handling Pandas data frames.
\subsection{Plotting a histogram}
\begin{python}
import matplotlib.pyplot as plt
import seaborn as sns
## distplot is deprecated; kdeplot and histplot replace it
## plot only density
sns.kdeplot([0, 1, 2, 3, 4, 5])
plt.show()
## plot density + histogram
sns.histplot([0, 1, 2, 3, 3, 3, 3, 4, 5, 7], kde=True, stat="density")
plt.show()
\end{python}
\subsection{Lineplot}
\begin{python}
import matplotlib.pyplot as plt
import seaborn as sns
## loading dataset
data = sns.load_dataset("iris")
# [150 rows x 5 columns]
### draw lineplot
sns.lineplot(x="sepal_length", y="sepal_width", data=data)
## setting the x limit of the plot: a single value sets only the left limit
plt.xlim(5)
plt.show()
\end{python}
\subsection{Scatterplot}
\begin{python}
# importing packages
import matplotlib.pyplot as plt
import seaborn as sns
# loading dataset
iris = sns.load_dataset("iris")
# one point per flower: sepal length on x, sepal width on y
sns.scatterplot(data=iris, x='sepal_length', y='sepal_width')
plt.show()
\end{python}
\section{OpenCV}
\subsection{Basics}
\subsubsection{Typecast PIL Image to OpenCV image}
\begin{python}
from PIL import Image
import cv2
import numpy as np
import matplotlib.pyplot as plt
pillowImage = Image.open("kiz-kulesi.jpg")
## float RGB array scaled to [0, 1], suitable for matplotlib
rgb_image_float= np.asarray(pillowImage,dtype=float)/255.0
plt.imshow(rgb_image_float)
## OpenCV functions expect BGR channel order: convert the RGB pixels
bgr_image = cv2.cvtColor(np.asarray(pillowImage.convert("RGB")), cv2.COLOR_RGB2BGR)
\end{python}
\subsubsection{Draw circle on PIL Image}
\begin{python}
from PIL import Image, ImageDraw
## empty (transparent) 200x200 canvas
image = Image.new('RGBA', (200, 200))
draw = ImageDraw.Draw(image)
# bounding box coordinates for the ellipse topleft, bot right (x1, y1, x2, y2)
## a square bounding box gives a circle
draw.ellipse((20, 20, 180, 180), fill = 'blue', outline ='blue')
## mark the centre with a single red pixel
draw.point((100, 100), 'red')
image.save('test.png')
\end{python}
\subsubsection{Draw text on PIL Image}
\begin{python}
from PIL import Image, ImageDraw, ImageFont
image = Image.new('RGBA', (200, 200))
draw = ImageDraw.Draw(image)
# drawing text size
text = "hello"
## path of a TrueType font file and the font size; adjust the path to your system
font = ImageFont.truetype(r'C:\Users\System-Pc\Desktop\arial.ttf', 20)
## ubuntu -- font can be found with fc-list command
draw.text((5, 5), text, fill ="red", font = font, align ="right")
image.save('test.png')
\end{python}
\subsubsection{Flip, resize, rotate, crop images}
\begin{python}
import cv2
import scipy.ndimage
import numpy as np
import matplotlib.pyplot as plt
original_image = cv2.imread("kiz-kulesi.jpg", cv2.IMREAD_GRAYSCALE)
flipud_image=np.flipud(original_image)
fliplr_image=np.fliplr(original_image)
rotated_image=scipy.ndimage.rotate(original_image,45)
## scipy.misc.imresize no longer exists in SciPy; cv2.resize with
## fx/fy scale factors does the same bilinear down-scaling
resized_image=cv2.resize(original_image, None, fx=0.5, fy=0.5, interpolation=cv2.INTER_LINEAR)
rows,cols=original_image.shape
## keep the middle third vertically and the middle half horizontally
croped_image = original_image[int(rows / 3): -int(rows / 3), int(cols / 4): - int(cols / 4)]
fig1, axes_array = plt.subplots(2, 3)
fig1.set_size_inches(9,6)
## one (image, title) pair per subplot, filled row by row
panels = [(original_image, 'Original'),
          (flipud_image, 'Flipped up-down'),
          (fliplr_image, 'Flipped left-right'),
          (rotated_image, 'Rotated'),
          (resized_image, 'Resized'),
          (croped_image, 'Cropped')]
for axes, (panel_image, panel_title) in zip(axes_array.flat, panels):
    axes.imshow(panel_image, cmap=plt.cm.gray)
    axes.set(title=panel_title)
plt.show()
\end{python}
\subsubsection{Operating on HSV colorspace}
\begin{python}
import numpy as np
import matplotlib
import matplotlib.colors
import matplotlib.pyplot as plt
from PIL import Image
def demo_rgb_to_hsv(original_image,reduce_intensity_factor=0.5):
    """Show an RGB image next to a copy whose HSV value channel is scaled.

    Args:
        original_image: image convertible to an array with at least three
            channels of 0..255 values (e.g. a PIL image).
        reduce_intensity_factor: factor applied to the V (value) channel.
    """
    original_rgb_float= np.asarray(original_image,dtype=float)/255.0
    ## drop a possible alpha channel: rgb_to_hsv needs exactly three channels
    original_rgb_float = original_rgb_float[:,:,:3]
    hsv_image=matplotlib.colors.rgb_to_hsv(original_rgb_float)
    hsv_image_processed=hsv_image.copy()
    ## channel 2 is V; hue and saturation stay untouched
    hsv_image_processed[:,: ,2]=hsv_image[:,: ,2]*reduce_intensity_factor
    rgb_image_processed=matplotlib.colors.hsv_to_rgb(hsv_image_processed)
    fig1, axes_array = plt.subplots(1, 2)
    fig1.set_size_inches(8,4)
    image_plot = axes_array[0].imshow(original_rgb_float) # Show the RGB image
    axes_array[0].axis('off')
    axes_array[0].set(title='RGB Image')
    image_plot = axes_array[1].imshow(rgb_image_processed) # Show the intensity-reduced image
    axes_array[1].axis('off')
    axes_array[1].set(title='Intensity Reduced Image')
    plt.show()
rgb_image_int = Image.open("kiz-kulesi.jpg")
demo_rgb_to_hsv(rgb_image_int)
\end{python}
\subsubsection{1d Gaussian Kernel}
\begin{python}
import numpy as np
import matplotlib.pyplot as plt
def display_1d_gaussian(mean=0.0,sigma=0.5):
    """Plot the density of a 1d Gaussian on the interval [-10, 10].

    Args:
        mean: centre of the curve.
        sigma: standard deviation (must be non-zero).
    """
    x=np.linspace(-10,10,1000)
    y= (1/np.sqrt(2*np.pi*sigma**2))*np.exp(-((x-mean)**2)/(2*sigma**2))
    fig, axes1 = plt.subplots(1, 1)
    fig.set_size_inches(6,3)
    ## the peak exceeds 1 for small sigma; enlarge the y range then
    ## so the curve is not clipped (default sigma keeps the range 0..1)
    y_top = max(1.0, 1.1 * float(y.max()))
    axes1.set(xlabel="X",ylabel="Y",title='Gaussian Curve',ylim=(0,y_top))
    plt.grid(True)
    axes1.plot(x,y,color='gray')
    plt.fill_between(x,y,0,color='#c0f0c0')
    plt.show()
\end{python}
\subsubsection{2d Gaussian Kernel Image}
\begin{python}
import scipy.stats
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.ticker import LinearLocator, FormatStrFormatter
def display_gaussian_kernel(sigma=1.0):
    """Show an isotropic 2d Gaussian as an image and as a 3d surface.

    Args:
        sigma: standard deviation used for both axes.
    """
    ## 400x400 evaluation grid over [-5, 5] x [-5, 5]
    X = np.linspace(-5, 5, 400)
    Y = np.linspace(-5, 5, 400)
    X, Y = np.meshgrid(X, Y)
    mu = np.array([0.0, 0.0])
    covariance = np.diag(np.array([sigma, sigma])**2)
    ## evaluate the density on a flat list of points, then restore the grid shape
    XY = np.column_stack([X.flat, Y.flat])
    z = scipy.stats.multivariate_normal.pdf(XY, mean=mu, cov=covariance)
    Z = z.reshape(X.shape)
    # Plot the surface.
    fig = plt.figure()
    fig.set_size_inches(8,4)
    ax1 = fig.add_subplot(121)
    ax1.imshow(Z)
    ax2 = fig.add_subplot(122, projection='3d')
    surf = ax2.plot_surface(X, Y, Z, cmap=plt.cm.coolwarm, linewidth=0, antialiased=False)
    # Customize the z axis.
    ax2.set_zlim(0, .2)
    ax2.zaxis.set_major_locator(LinearLocator(10))
    ax2.zaxis.set_major_formatter(FormatStrFormatter('%.02f'))
    # Add a color bar which maps values to colors.
    fig.colorbar(surf, shrink=0.5, aspect=5)
    plt.show()
display_gaussian_kernel()
\end{python}
\subsubsection{find horizontal lines}
\begin{python}
import cv2
import matplotlib.pyplot as plt
# Load image, convert to grayscale, Otsu's threshold
image = cv2.imread('kiz-kulesi.jpg')
result = image.copy()
gray = cv2.cvtColor(image,cv2.COLOR_BGR2GRAY)
thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
# Detect horizontal lines: opening with a wide, 1 px high kernel keeps
# only structures that are at least 40 px long horizontally
horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (40,1))
detect_horizontal = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, horizontal_kernel, iterations=2)
cnts = cv2.findContours(detect_horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
# findContours returns 2 or 3 values depending on the OpenCV version
cnts = cnts[0] if len(cnts) == 2 else cnts[1]
for c in cnts:
    cv2.drawContours(result, [c], -1, (36,255,12), 2)
# OpenCV images are BGR, matplotlib expects RGB
plt.figure(figsize = (20,9)); plt.imshow(cv2.cvtColor(result, cv2.COLOR_BGR2RGB))
\end{python}
\subsubsection{Drawing text on image}
\begin{python}
import cv2
import matplotlib.pyplot as plt
# path
path = r'kiz-kulesi.jpg'
# Reading an image in default mode
image = cv2.imread(path)
# font
font = cv2.FONT_HERSHEY_SIMPLEX
# position of the bottom-left corner of the text
org = (40, 40)
# fontScale
fontScale = 1
# Green color in BGR
color = (0, 255, 0)
# Line thickness of 2 px
thickness = 2
image = cv2.putText(image, 'Hello', org, font, fontScale, color, thickness, cv2.LINE_AA)
# Displaying the image: OpenCV stores BGR, matplotlib expects RGB
img_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
plt.figure(figsize = (20,9)); plt.imshow(img_rgb)
\end{python}
\subsection{Template Matching}
\begin{python}
from matplotlib import pyplot as plt
import numpy as np
import cv2
import imutils
def multiscaleTemplateMatching(imFileToLoad,templateFileToLoad):
    """Find a template in an image by matching edge maps at several scales.

    The image is shrunk step by step (scale 1.0 down to 0.2) while the
    template keeps its size; the scale with the highest correlation wins.

    Args:
        imFileToLoad: path of the image to search in.
        templateFileToLoad: path of the template image.

    Returns:
        [startX, startY, endX, endY] of the best match in coordinates
        of the original image.

    Raises:
        FileNotFoundError: if one of the files cannot be read.
        ValueError: if the template is larger than the image.
    """
    template = cv2.imread(templateFileToLoad)
    if template is None:
        raise FileNotFoundError("cannot read template: %s" % templateFileToLoad)
    template = cv2.cvtColor(template, cv2.COLOR_BGR2GRAY)
    template = cv2.Canny(template, 50, 200)
    (tH, tW) = template.shape[:2]
    # load the image to find the template in
    image = cv2.imread(imFileToLoad)
    if image is None:
        raise FileNotFoundError("cannot read image: %s" % imFileToLoad)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    found = None
    # loop over the scales of the image, from largest to smallest
    for scale in np.linspace(0.2, 1.0, 20)[::-1]:
        # resize the image according to the scale, and keep track
        # of the ratio of the resizing
        resized = imutils.resize(gray, width = int(gray.shape[1] * scale))
        r = gray.shape[1] / float(resized.shape[1])
        # if the resized image is smaller than the template, then break
        # from the loop
        if resized.shape[0] < tH or resized.shape[1] < tW:
            break
        # detect edges in the resized, grayscale image and apply template
        # matching to find the template in the image
        edged = cv2.Canny(resized, 50, 200)
        result = cv2.matchTemplate(edged, template, cv2.TM_CCOEFF)
        (_, maxVal, _, maxLoc) = cv2.minMaxLoc(result)
        # if we have found a new maximum correlation value, then update
        # the bookkeeping variable
        if found is None or maxVal > found[0]:
            found = (maxVal, maxLoc, r)
    # no scale was tried at all: the template does not fit into the image
    if found is None:
        raise ValueError("template is larger than the image")
    # unpack the bookkeeping variable and compute the (x, y) coordinates
    # of the bounding box based on the resized ratio
    (_, maxLoc, r) = found
    (startX, startY) = (int(maxLoc[0] * r), int(maxLoc[1] * r))
    (endX, endY) = (int((maxLoc[0] + tW) * r), int((maxLoc[1] + tH) * r))
    return [startX, startY, endX, endY]
## image to search in and template to look for (example names; adjust to your files)
shot = "shot.png"
templateFileToLoad = "template.png"
img_bgr = cv2.imread(shot)
startX, startY, endX, endY = multiscaleTemplateMatching(shot,templateFileToLoad)
cv2.rectangle(img_bgr, (startX, startY), (endX, endY), (0, 0, 255), 2)
## OpenCV stores BGR, matplotlib expects RGB
img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
plt.figure(figsize = (20,9))
plt.imshow(img_rgb)
\end{python}
\subsection{Overlapping bounding box removal: nonmaxima suppression}
\begin{python}
def NMS(boxes, overlapThresh = 0.4):
    """Remove bounding boxes that overlap strongly with another box.

    Args:
        boxes: numpy array of shape (n, 4) with rows (x1, y1, x2, y2):
            top-left and bottom-right corner of each box.
        overlapThresh: a box is dropped when its intersection with any
            remaining box covers more than this fraction of that other box.

    Returns:
        The kept boxes as an integer numpy array, or an empty list when
        no boxes are given.
    """
    # Return an empty list, if no boxes given
    if len(boxes) == 0:
        return []
    x1 = boxes[:, 0]  # x coordinate of the top-left corner
    y1 = boxes[:, 1]  # y coordinate of the top-left corner
    x2 = boxes[:, 2]  # x coordinate of the bottom-right corner
    y2 = boxes[:, 3]  # y coordinate of the bottom-right corner
    # Compute the area of the bounding boxes.
    # We add 1, because the pixel at the start as well as at the end counts
    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    # The indices of all boxes at start. We will remove redundant indices one by one.
    indices = np.arange(len(x1))
    for i,box in enumerate(boxes):
        # Indices of the boxes that are still kept, except the current one
        temp_indices = indices[indices!=i]
        # Find out the coordinates of the intersection box
        xx1 = np.maximum(box[0], boxes[temp_indices,0])
        yy1 = np.maximum(box[1], boxes[temp_indices,1])
        xx2 = np.minimum(box[2], boxes[temp_indices,2])
        yy2 = np.minimum(box[3], boxes[temp_indices,3])
        # Find out the width and the height of the intersection box
        w = np.maximum(0, xx2 - xx1 + 1)
        h = np.maximum(0, yy2 - yy1 + 1)
        # compute the ratio of overlap
        overlap = (w * h) / areas[temp_indices]
        # if the current box overlaps any other box by more than the threshold, remove its index.
        # The comparison must happen element-wise BEFORE np.any:
        # np.any(overlap) > thresh would compare a bool with the threshold.
        if np.any(overlap > overlapThresh):
            indices = indices[indices != i]
    #return only the boxes at the remaining indices
    return boxes[indices].astype(int)
\end{python}
\subsection{SingleScale Multiple Template Matching}
\begin{python}
import cv2
import numpy as np
def singleScaleMultipleTemplateMatching(imageFileName, templateFileName, threshold=0.75):
    """Find every occurrence of a template in an image at the original scale.

    Args:
        imageFileName: path of the image to search in.
        templateFileName: path of the template image.
        threshold: minimum normalised correlation for a location to count as a match.

    Returns:
        (pick, img_rgb): the boxes (startX, startY, endX, endY) kept by NMS and
        a copy of the image with these boxes drawn in green.
    """
    print("[INFO] loading images...")
    image = cv2.imread(imageFileName)
    img_rgb = image.copy()
    template = cv2.imread(templateFileName)
    (tH, tW) = template.shape[:2]
    # convert both the image and template to grayscale
    imageGray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    templateGray = cv2.cvtColor(template, cv2.COLOR_BGR2GRAY)
    # perform template matching
    print("[INFO] performing template matching...")
    result = cv2.matchTemplate(imageGray, templateGray,
                               cv2.TM_CCOEFF_NORMED)
    # np.where returns row (y) indices first, then column (x) indices
    (yCoords, xCoords) = np.where(result >= threshold)
    print("[INFO] {} matched locations *before* NMS".format(len(yCoords)))
    # one rectangle (x1, y1, x2, y2) per matched top-left corner
    rects = [(x, y, x + tW, y + tH) for (x, y) in zip(xCoords, yCoords)]
    # apply non-maxima suppression to the rectangles
    pick = NMS(np.array(rects))
    # pick = rects
    print("[INFO] {} matched locations *after* NMS".format(len(pick)))
    # loop over the final bounding boxes
    for (startX, startY, endX, endY) in pick:
        # draw the bounding box on the image; plain ints keep cv2 happy
        cv2.rectangle(img_rgb, (int(startX), int(startY)), (int(endX), int(endY)),
                      (0, 255, 0), 2)
    return pick, img_rgb
import matplotlib.pyplot as plt
pick , img_rgb = singleScaleMultipleTemplateMatching("cropped2.png","template.png")
plt.figure(figsize = (20,9))
## the returned image comes from cv2.imread (BGR); convert it for matplotlib
plt.imshow(cv2.cvtColor(img_rgb, cv2.COLOR_BGR2RGB))
\end{python}
\subsection{Finding and Plotting Contours}
\begin{python}
import cv2
import numpy as np
def findAndPlotContours(fileName, blob_area_thresh=20):
    """Find the outer contours of dark regions and draw them on a white canvas.

    Args:
        fileName: path of the image to analyse.
        blob_area_thresh: contours with an area not larger than this are ignored.

    Returns:
        (result, cnts): a white image of the input size with the contours
        drawn in red, and the list of kept contours (possibly empty).
    """
    img = cv2.imread(fileName, cv2.IMREAD_COLOR)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    thresh = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 101, 3)
    ### following morphology open and close can be applied.
    #kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (5,5))
    #blob = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel)
    #kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (9,9))
    #blob = cv2.morphologyEx(blob, cv2.MORPH_CLOSE, kernel)
    blob = thresh
    # invert blob: findContours looks for white objects on black background
    blob = (255 - blob)
    # Get contours (2 or 3 return values depending on the OpenCV version)
    cnts = cv2.findContours(blob, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    cnts = cnts[0] if len(cnts) == 2 else cnts[1]
    ## select the contours whose area is larger than blob_area_thresh
    cnts = [c for c in cnts if cv2.contourArea(c) > blob_area_thresh]
    #big_contour = max(cnts, key=cv2.contourArea)
    ## white canvas with the size of the input image
    result = np.full_like(img, 255)
    for c in cnts:
        cv2.drawContours(result, [c], -1, (0,0,255), 1)
    ## return buffer image and ALL contours (not only the last loop variable)
    return result, cnts
result, cnts = findAndPlotContours("kiz-kulesi.jpg",20)
\end{python}
\subsection{Circle Detection}
\begin{python}
import matplotlib.pyplot as plt
import cv2
img = cv2.imread("cropped2.png", cv2.IMREAD_COLOR)
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
detected_circles = cv2.HoughCircles(gray, cv2.HOUGH_GRADIENT, 1, 20, param1 = 50, param2 = 30, minRadius = 1, maxRadius = 400)
# HoughCircles returns None when nothing is found
if detected_circles is not None:
    for pt in detected_circles[0, :]:
        # the results are floats; the drawing functions need integers
        a, b, r = int(round(pt[0])), int(round(pt[1])), int(round(pt[2]))
        # Draw the circumference of the circle.
        cv2.circle(img, (a, b), r, (0, 255, 0), 2)
        # Draw a small circle (of radius 1) to show the center.
        cv2.circle(img, (a, b), 1, (0, 0, 255), 3)
plt.figure(figsize = (20,9))
# OpenCV stores BGR, matplotlib expects RGB
plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
\end{python}
\subsection{Connected Components Analysis}
\begin{python}
import matplotlib.pyplot as plt
import numpy as np
import cv2
img = cv2.imread("cropped2.png", cv2.IMREAD_COLOR)
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
threshold = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)[1]
# 4-connectivity; label image is returned as 32 bit integers
analysis = cv2.connectedComponentsWithStats(threshold, 4, cv2.CV_32S)
(totalLabels, label_ids, values, centroid) = analysis
#plt.figure(figsize = (20,9))
#plt.imshow(threshold)
# Loop through each component (label 0 is the background, so start at 1)
output = np.zeros(gray.shape, dtype="uint8")
for i in range(1, totalLabels):
    area = values[i, cv2.CC_STAT_AREA]
    # keep only components of medium size
    if (area > 110) and (area < 900):
        # label_ids stores the component ID of every pixel and has
        # the same dimension as the threshold image.
        # Select the pixels of this component and mark them white (255)
        componentMask = (label_ids == i).astype("uint8") * 255
        # Creating the Final output mask
        output = cv2.bitwise_or(output, componentMask)
plt.figure(figsize = (20,9))
plt.imshow(output, cmap="gray")
\end{python}
\subsection{Fit ellipses to objects}
\begin{python}
import cv2
import matplotlib.pyplot as plt
img = cv2.imread("cropped2.png", cv2.IMREAD_COLOR)
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
## neighbourhood size of the adaptive threshold; must be odd and > 1
## (the value 11 is an example -- tune it for your image)
s = 11
gray = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, s, 7.0)
## findContours returns 2 or 3 values depending on the OpenCV version
cnts = cv2.findContours(gray,cv2.RETR_TREE,cv2.CHAIN_APPROX_SIMPLE)
cnts = cnts[0] if len(cnts) == 2 else cnts[1]
ellipses = []
for cnt in cnts:
    ## fitEllipse needs at least five points
    if len(cnt) < 5:
        continue
    ellipse=cv2.fitEllipse(cnt)
    print(ellipse)
    ellipses.append(ellipse)
    centCoord = ( int(ellipse[0][0]), int(ellipse[0][1]) )
    ## fitEllipse reports FULL axis lengths, cv2.ellipse expects HALF axes
    axisLen = ( int(ellipse[1][0] / 2), int(ellipse[1][1] / 2) )
    angle = ellipse[2]
    ## angle filtering
    #offSet = np.min( np.fabs( [angle, angle-90, angle-180, angle-270, angle-360]) )
    #if offSet < 5:
    img = cv2.ellipse(img, centCoord, axisLen, angle, 0, 360, (0,0,255))
# cv2.drawContours(img,cnts,-1,(150,10,255),2)
plt.figure(figsize = (20,9))
## OpenCV stores BGR, matplotlib expects RGB
plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
\end{python}
\section{Numpy}
\subsection{Fitting}
\subsubsection{Curve fitting}
\begin{python}
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
def generateQuadraticData():
    """Sample 100 noisy points from the parabola y = x^2 + 0.1.

    x is drawn uniformly from [-0.5, 0.5); uniform noise from [0, 0.1)
    is added to y.

    Returns:
        tuple: (x, y), two 1-D arrays of length 100.
    """
    n_samples = 100
    quad_coef, lin_coef, offset = 1, 0.0, 0.1
    # Draw x first and the noise second to keep the random stream order.
    x = np.random.rand(n_samples) - 0.5
    noise = 0.1 * np.random.rand(n_samples)
    y = quad_coef * (x * x) + lin_coef * x + offset + noise
    # plt.scatter(x, y)  # uncomment to inspect the samples
    return x, y
# Compare least-squares fits of degree 1, 2 and 3 on the noisy quadratic
# samples and plot the three fitted curves together with their R^2 scores.
x, y = generateQuadraticData()
X = x.reshape(-1, 1)  # single-feature design matrix of shape (n_samples, 1)

regr = LinearRegression()
quadratic = PolynomialFeatures(degree=2)
cubic = PolynomialFeatures(degree=3)
X_quad = quadratic.fit_transform(X)
X_cubic = cubic.fit_transform(X)

# Evaluation grid used only for drawing the fitted curves.
X_fit = np.arange(X.min(), X.max(), 0.05)[:, np.newaxis]

# Degree 1: plain linear regression on the raw feature.
regr = regr.fit(X, y)
y_lin_fit = regr.predict(X_fit)
linear_r2 = r2_score(y, regr.predict(X))

# Degree 2: the feature expander was fitted on the training data above, so
# the grid is only transformed (not re-fitted).
regr = regr.fit(X_quad, y)
y_quad_fit = regr.predict(quadratic.transform(X_fit))
quadratic_r2 = r2_score(y, regr.predict(X_quad))

# Degree 3: same pattern with the cubic expansion.
regr = regr.fit(X_cubic, y)
y_cubic_fit = regr.predict(cubic.transform(X_fit))
cubic_r2 = r2_score(y, regr.predict(X_cubic))

plt.scatter(X, y, label='training points', color='lightgray')
# One entry per fitted curve: (predictions, legend label, colour, line style).
curves = [
    (y_lin_fit, 'linear (d=1), $R^2=%.2f$' % linear_r2, 'blue', ':'),
    (y_quad_fit, 'quadratic (d=2), $R^2=%.2f$' % quadratic_r2, 'red', '-'),
    (y_cubic_fit, 'cubic (d=3), $R^2=%.2f$' % cubic_r2, 'green', '--'),
]
for y_curve, label, color, linestyle in curves:
    plt.plot(X_fit, y_curve, label=label, color=color, lw=2, linestyle=linestyle)
plt.legend(loc='upper right')
plt.tight_layout()
plt.show()
\end{python}
\subsubsection{RANSAC curve fitting}
\begin{python}
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn import linear_model, datasets
def generateQuadraticData(noiseFraction = 0.2):
    """Build a noise-free parabola y = x^2 + 0.1 and append uniform outliers.

    Args:
        noiseFraction: number of outliers, expressed as a fraction of the
            100 clean samples (0.2 gives 20 extra points).

    Returns:
        tuple: (x, y), 1-D arrays holding the clean samples followed by
        the outliers.
    """
    nDataPoints = 100
    nNoisePoints = int(nDataPoints * noiseFraction)
    quadCoef, linCoef, offset = 1, 0.0, 0.1
    # Clean samples with x drawn uniformly from [-0.5, 0.5).
    cleanX = np.random.rand(nDataPoints) - 0.5
    cleanY = quadCoef * (cleanX * cleanX) + linCoef * cleanX + offset
    # Outliers: (x, y) pairs drawn uniformly from the unit square.
    outliers = np.random.rand(nNoisePoints, 2)
    x = np.hstack((cleanX, outliers[:, 0]))
    y = np.hstack((cleanY, outliers[:, 1]))
    # plt.scatter(x, y)  # uncomment to inspect the samples
    return x, y
# Robustly fit a quadratic to data contaminated with outliers using RANSAC
# and plot the fitted curve together with the inlier/outlier split.
noiseFraction = 0.7
x, y = generateQuadraticData(noiseFraction)
X = x.reshape(-1, 1)  # single-feature design matrix of shape (n_samples, 1)

quadratic = PolynomialFeatures(degree=2)
X_quad = quadratic.fit_transform(X)
# Evaluation grid: from twice the smallest to twice the largest sampled x.
X_fit = np.arange(2*np.min(X), 2*np.max(X), 0.05)[:, np.newaxis]

regr = linear_model.RANSACRegressor()
regr = regr.fit(X_quad, y)
# The feature expander is already fitted on the training data, so the grid
# is only transformed (not re-fitted).
y_quad_fit = regr.predict(quadratic.transform(X_fit))
# R^2 is evaluated over all points, outliers included.
quadratic_r2 = r2_score(y, regr.predict(X_quad))

# Boolean masks over the samples: points RANSAC kept versus rejected.
inlier_mask = regr.inlier_mask_
outlier_mask = np.logical_not(inlier_mask)

plt.figure(figsize = (10,9))
plt.plot(X_fit, y_quad_fit, label='quadratic (d=2), $R^2=%.2f$' % quadratic_r2, color='blue', lw=1, linestyle='-')
plt.scatter(X[inlier_mask], y[inlier_mask], color="green", marker=".", label="Inliers")
plt.scatter(X[outlier_mask], y[outlier_mask], color="red", marker=".", label="Outliers")
plt.legend(loc='upper right')
plt.tight_layout()
plt.show()
\end{python}
\section{Scikit-Learn}
Scikit-learn provides implementations of many machine learning models.
\subsection{Linear Regression}
\begin{python}
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
# Fetch the diabetes data as a (features, target) pair.
diabetes_X, diabetes_y = datasets.load_diabetes(return_X_y=True)

# Keep only the feature in column 2, as an (n_samples, 1) matrix.
diabetes_X = diabetes_X[:, np.newaxis, 2]

# Hold out the last 20 samples for testing and train on the rest.
n_test = 20
diabetes_X_train, diabetes_X_test = diabetes_X[:-n_test], diabetes_X[-n_test:]
diabetes_y_train, diabetes_y_test = diabetes_y[:-n_test], diabetes_y[-n_test:]

# Fit ordinary least squares on the training split, then predict the test split.
regr = linear_model.LinearRegression()
regr.fit(diabetes_X_train, diabetes_y_train)
diabetes_y_pred = regr.predict(diabetes_X_test)

# Test-set metrics; a coefficient of determination of 1 is a perfect prediction.
mse = mean_squared_error(diabetes_y_test, diabetes_y_pred)
r2 = r2_score(diabetes_y_test, diabetes_y_pred)
print("Coefficients: \n", regr.coef_)
print("Mean squared error: %.2f" % mse)
print("Coefficient of determination: %.2f" % r2)

# Plot the held-out samples and the fitted line, with axis ticks hidden.
plt.scatter(diabetes_X_test, diabetes_y_test, color="black")
plt.plot(diabetes_X_test, diabetes_y_pred, color="blue", linewidth=3)
plt.xticks(())
plt.yticks(())
plt.show()
\end{python}
%\section*{Appendix}
%\bibliographystyle{plain}
%\bibliography{references}
\end{document}