风之空灵: 七月 2021

7/19/2021

SQL

Mysql

/usr/local/mysql/bin/mysql -uroot -p

sql不分大小写！

SELECT * FROM people LIMIT 10;

SELECT DISTINCT language FROM films;

SELECT COUNT(*) FROM people;

SELECT COUNT(birthdate) FROM people;

SELECT COUNT(DISTINCT birthdate) FROM people;

SELECT title FROM films WHERE title = 'Metropolis'; must be single quot

SELECT * FROM films WHERE budget > 10000;

Use <> but not !=

SELECT title FROM films WHERE release_year > 1994 AND release_year < 2000;

SELECT title FROM films WHERE release_year > 1994 AND < 2000;

SELECT title FROM films WHERE (release_year = 1994 OR release_year = 1995) AND (certification = 'PG' OR certification = 'R');

SELECT title FROM films WHERE release_year BETWEEN 1994 AND 2000;

SELECT name FROM kids WHERE age IN (2, 4, 6, 8, 10);

SELECT name FROM people WHERE birthdate IS NOT NULL;

SELECT name FROM companies WHERE name LIKE 'Data%';

SELECT AVG(budget) FROM films;

MAX () MIN() SUM() AVG(), but cant use after WHERE clause

SELECT (4.0 / 3.0) AS result; 1.333

SELECT (4 / 3); = 1

Can perform + - * /

SELECT MAX(budget) AS max_budget,

MAX(duration) AS max_duration

FROM films;

SELECT AVG(duration/60.0) AS avg_duration_hours

FROM films; 必须是60.0 不然不行

SELECT title FROM films

ORDER BY release_year, title DESC;

SELECT sex, count(*)

FROM employees

GROUP BY sex

ORDER BY count DESC;

SELECT release_year

FROM films

GROUP BY release_year

HAVING COUNT(title) > 10;

SELECT title, imdb_score FROM films

JOIN reviews

ON films.id = reviews.film_id

WHERE title = 'To Kill a Mockingbird';

SELECT cities.name AS city, countries.name AS country

FROM cities

INNER JOIN countries

ON cities.country_code = countries.code;

SELECT c1.name AS city, c2.name AS country

FROM cities AS c1

INNER JOIN countries AS c2

ON c1.country_code = c2.code;

SELECT *

FROM left_table

INNER JOIN right_table

ON left_table.id = right_table.id

INNER JOIN another_table

ON left_table.id = another_table.id;

SELECT *

FROM countries

INNER JOIN economies

USING(code)

SELECT name, continent, code, surface_area,

CASE WHEN surface_area > 2000000

THEN 'large'

WHEN surface_area > 350000

THEN 'medium'

ELSE 'small' END

AS geosize_group

INTO countries_plus

FROM countries;

Multiple line comment /* And */

Left join, right join, full join

SELECT NAME

FROM cities

EXCEPT / intersect / Union/ Union all

SELECT capital

FROM countries

ORDER BY name;

Semi-join

SELECT DISTINCT name

FROM languages

WHERE code IN

(SELECT code

FROM countries

WHERE region = 'Middle East')

ORDER BY name;

SELECT max(inflation_rate) as max_inf

FROM (

SELECT name, continent, inflation_rate

FROM countries

INNER JOIN economies

USING (code)

WHERE year = 2015) AS subquery

GROUP BY continent;

SELECT name, continent, inflation_rate

FROM countries

INNER JOIN economies

USING (code)

WHERE year = 2015

and inflation_rate IN (

SELECT MAX(inflation_rate) AS max_inf

FROM (

SELECT name, continent, inflation_rate

FROM countries

INNER JOIN economies

USING (code)

WHERE year = 2015) AS subquery

GROUP BY continent);

SELECT DISTINCT name, total_investment, imports
  FROM countries AS c
    LEFT JOIN economies AS e
      ON (c.code = e.code
        AND c.code IN (
          SELECT l.code
          FROM languages AS l
          WHERE official = 'true'
        ) )
  WHERE region = 'Central America' AND year = 2015
ORDER BY name;

 
 
SELECT region, continent, AVG(fertility_rate) AS avg_fert_rate
  FROM countries AS c
    INNER JOIN populations AS p
      ON c.code = p.country_code
  WHERE year = 2015
GROUP BY region, continent
ORDER BY avg_fert_rate;
 
SELECT cities.name, cities.country_code, city_proper_pop, metroarea_pop,  
      city_proper_pop / metroarea_pop * 100.0 AS city_perc
  FROM cities 
  WHERE name IN
    (SELECT capital
     FROM countries
     WHERE continent = 'Europe'
        OR continent LIKE '%America')
        AND metroarea_pop IS NOT NULL
ORDER BY city_perc DESC
LIMIT 10
 
SELECT *
FROM table
WHERE 
         CASE WHEN a > 5 THEN 'Keep'
         WHEN a <= 5 THEN 'Exclude' END = 'Keep'; 
 
SELECT 
	season,
        date,
	home_goal,
	away_goal
FROM matches_italy
WHERE 
-- Exclude games not won by Bologna
	CASE WHEN hometeam_id = 9857 and home_goal > away_goal THEN 'Bologna Win'
		WHEN awayteam_id = 9857 AND away_goal > home_goal THEN 'Bologna Win' 
		END IS NOT NULL; 

SELECT 
	c.name AS country,
    -- Count games from the 2012/2013 season
	COUNT (CASE WHEN m.season = '2012/2013' 
        	THEN m.id ELSE NULL END) AS matches_2012_2013
FROM country AS c
LEFT JOIN match AS m
ON c.id = m.country_id
-- Group by country name alias
GROUP BY country;

 -- percentage of game tied
SELECT 
	c.name AS country,
    -- Round the percentage of tied games to 2 decimal points
     ROUND(AVG(CASE WHEN m.season='2013/2014' AND m.home_goal = m.away_goal THEN 1
			 WHEN m.season='2013/2014' AND m.home_goal != m.away_goal THEN 0
			 END),2) AS pct_ties_2013_2014,
	ROUND(AVG(CASE WHEN m.season='2014/2015' AND m.home_goal = m.away_goal THEN 1
			 WHEN m.season='2014/2015' AND m.home_goal != m.away_goal THEN 0
			 END),2) AS pct_ties_2014_2015
FROM country AS c
LEFT JOIN matches AS m
ON c.id = m.country_id
GROUP BY country;


Windows:SELECT
	m.away_goal,
	AVG(m.home_goal + m.away_goal) over() AS overall_avg
FROM match AS m

SELECT 
    id,
    RANK() OVER(ORDER BY home_goal) AS rank
FROM match;

rank() over(ORDER BY AVG(m.home_goal + m.away_goal)) AS league_rank
ORDER BY league_rank;

SELECT 
    id,
    RANK() OVER(ORDER BY home_goal DESC) AS rank
FROM match;

    AVG(home_goal) OVER(PARTITION BY season, 
         	EXTRACT(month FROM date)) AS season_mo_home,


Sliding to the left:SELECT 
	date,
	home_goal,
	away_goal,
    -- Create a running total and running average of home goals
    SUM(home_goal) over(ORDER BY DATE
         rows between unbounded preceding and current row) AS running_total,
    AVG(home_goal) OVER(ORDER BY date 
         rows between unbounded preceding and current row) AS running_avg
FROM match
WHERE 
	hometeam_id = 9908 
	AND season = '2011/2012';

Slide to the right:-- Create a running total and running average of home goals
    SUM(home_goal) over(ORDER BY DATE DESC
         ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS running_total,
    AVG(home_goal) OVER(ORDER BY DATE DESC
         ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS running_avg



-- Set up the home team CTE
with home AS(
  SELECT m.id, t.team_long_name,
	  CASE WHEN m.home_goal > m.away_goal THEN 'MU Win'
		   WHEN m.home_goal < m.away_goal THEN 'MU Loss' 
  		   ELSE 'Tie' END AS outcome
  FROM match AS m
  LEFT JOIN team AS t ON m.hometeam_id = t.team_api_id),
-- Set up the away team CTE
away AS(
  SELECT m.id, t.team_long_name,
	  CASE WHEN m.home_goal > m.away_goal THEN 'MU Loss'
		   WHEN m.home_goal < m.away_goal THEN 'MU Win' 
  		   ELSE 'Tie' END AS outcome
  FROM match AS m
  LEFT JOIN team AS t ON m.awayteam_id = t.team_api_id)
-- Select columns and and rank the matches by date
SELECT DISTINCT
    m.DATE,
    home.team_long_name AS home_team,
    away.team_long_name AS away_team,
    m.home_goal, m.away_goal,
    rank() over(ORDER BY ABS(home_goal - away_goal) DESC) as match_rank
-- Join the CTEs onto the match table
FROM match AS m
LEFT JOIN home ON m.id = home.id
LEFT JOIN away ON m.id = away.id
WHERE m.season = '2014/2015'
      AND ((home.team_long_name = 'Manchester United' AND home.outcome = 'MU Loss')
      OR (away.team_long_name = 'Manchester United' AND away.outcome = 'MU Loss'));

7/15/2021

awk print the first 4 and last 4 letters from a string

cat tmp | awk '{print $1","substr($1,1,4)"_"substr($1,length($1)-3,length($1))}' > input_list_combine.table

7/14/2021

Data cleaning

Df[sth].dtype()

Df[sth].astype(‘category’)

Assert 1+1 == 2

$ from fuzzywuzzy import fuzz

$ fuzz.WRaio("Reeding", "Reading")

Matches.index shows column names

7/10/2021

Import

$ data = np.genfromtxt('titanic.csv', delimiter=',', names=True, dtype=None)

$ np.recfromcsv(‘file_name’, nrows= 5, header = None)

$ pd.read_csv()

$ pd.read_table()

data = pd.read_csv(file, sep='\t', comment='#', na_values='Nothing')

Magics:

!ls

import os

wd = os.getcwd()

os.listdir(wd)

df1 = xls.parse(0, skiprows= 1, names=['Country', 'AAM due to War (2002)'])

df2 = xls.parse(0, usecols= 0, skiprows=1, names=['z', 'A'])

# Get the HDF5 group: group

group = data['strain']

# Check out keys of group

for key in group.keys():

print(key)

# Set variable equal to time series data: strain

strain = data['strain']['Strain'].value

# Set number of time points to sample: num_samples

num_samples = 10000

# Set time vector

time = np.arange(0, 1, 1/num_samples)

# Plot data

plt.plot(time, strain[:num_samples])

plt.xlabel('GPS Time (s)')

plt.ylabel('strain')

plt.show()

# Print the keys of the MATLAB dictionary

print(mat.keys())

# Print the type of the value corresponding to the key 'CYratioCyt'

print(type(mat['CYratioCyt']))

# Print the shape of the value corresponding to the key 'CYratioCyt'

print(mat['CYratioCyt'].shape)

# Subset the array and plot it

data = mat['CYratioCyt'][25, 5:]

fig = plt.figure()

plt.plot(data)

plt.xlabel('time (min.)')

plt.ylabel('normalized fluorescence (measure of expression)')

plt.show()

# Import necessary module

from sqlalchemy import create_engine

# Create engine: engine

engine = create_engine('sqlite:///Chinook.SQLite')

# Import necessary module

from sqlalchemy import create_engine

# Create engine: engine

engine = create_engine('sqlite:///Chinook.sqlite')

# Save the table names to a list: table_names

table_names = engine.table_names()

# Print the table names to the shell

print(table_names)

# Import packages

from sqlalchemy import create_engine

import pandas as pd

# Create engine: engine

engine = create_engine('sqlite:///Chinook.sqlite')

# Open engine connection: con

con = engine.connect()

# Perform query: rs

rs = con.execute("SELECT * FROM album")

# Save results of the query to DataFrame: df

df = pd.DataFrame(rs.fetchall())

# Close connection

con.close()

# Print head of DataFrame df

print(df.head())

# Open engine in context manager

# Perform query and save results to DataFrame: df

with engine.connect() as con:

rs = con.execute("SELECT LastName, Title FROM Employee")

df = pd.DataFrame(rs.fetchmany(size=3))

df.columns = rs.keys()

# Open engine in context manager

# Perform query and save results to DataFrame: df

with engine.connect() as con:

rs = con.execute("SELECT * FROM Employee WHERE EmployeeId >= 6")

df = pd.DataFrame(rs.fetchall())

df.columns = rs.keys()

with engine.connect() as con:

rs = con.execute("SELECT * FROM Employee ORDER BY BirthDate")

df = pd.DataFrame(rs.fetchall())

# Execute query and store records in DataFrame: df

df = pd.read_sql_query("SELECT * FROM album", engine)

df = pd.read_sql_query("SELECT * FROM Employee WHERE EmployeeId >= 6 ORDER BY BirthDate", engine)

with engine.connect() as con:

rs=con.execute("SELECT Title, Name FROM Album INNER JOIN Artist on Album.ArtistID = Artist.ArtistID")

df=pd.DataFrame(rs.fetchall())

df.columns=rs.keys()

df = pd.read_sql_query("SELECT * FROM PlaylistTrack INNER JOIN Track on PlaylistTrack.TrackId = Track.TrackId WHERE Milliseconds < 250000", engine)

7/06/2021

Manage data

Df[A & B]

Df[ A | B]

Df[ ~B] : 不是boolean B的subset

Pmf(df.col, normalize=False)

Cdf(df[age])

Q = cdf_income.inverse(0.75) : 75th percentile income

Df_subset.Corr()